Note about icu_I18N.h being unstable
[yaz-moved-to-github.git] / include / yaz / icu_I18N.h
index efcd033..457f767 100644 (file)
@@ -1,76 +1,90 @@
-/* $Id: icu_I18N.h,v 1.1 2007-10-22 12:21:39 adam Exp $
-   Copyright (c) 2006-2007, Index Data.
-
-   This file is part of Pazpar2.
-
-   Pazpar2 is free software; you can redistribute it and/or modify it under
-   the terms of the GNU General Public License as published by the Free
-   Software Foundation; either version 2, or (at your option) any later
-   version.
-
-   Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or
-   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-   for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with Pazpar2; see the file LICENSE.  If not, write to the
-   Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
-   02111-1307, USA.
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2009 Index Data.
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Index Data nor the names of its contributors
+ *       may be used to endorse or promote products derived from this
+ *       software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+    \brief Internal header for ICU utilities
+
+    These functions, while non-static, are considered unstable and internal
+    and may be renamed for each YAZ release.
 */
 
 #ifndef ICU_I18NL_H
 #define ICU_I18NL_H
 
 */
 
 #ifndef ICU_I18NL_H
 #define ICU_I18NL_H
 
-#include <yaz/nmem.h>
-
-#include <libxml/parser.h>
-#include <libxml/tree.h>
+#include <yaz/yconfig.h>
 
 #include <unicode/utypes.h>   /* Basic ICU data types */
 #include <unicode/uchar.h>    /* char names           */
 
 
 #include <unicode/utypes.h>   /* Basic ICU data types */
 #include <unicode/uchar.h>    /* char names           */
 
-//#include <unicode/ustdio.h>
 #include <unicode/ucol.h> 
 #include <unicode/ucol.h> 
-//#include <unicode/ucnv.h>     /* C   Converter API    */
-//#include <unicode/ustring.h>  /* some more string fcns*/
-//#include <unicode/uloc.h>
 #include <unicode/ubrk.h>
 #include <unicode/ubrk.h>
-//#include <unicode/unistr.h>
 #include <unicode/utrans.h>
 
 #include <unicode/utrans.h>
 
+#include <yaz/icu.h>
 
 
-
-// declared structs and functions
+/* declared structs and functions */
 
 int icu_check_status (UErrorCode status);
 
 struct icu_buf_utf16
 {
 
 int icu_check_status (UErrorCode status);
 
 struct icu_buf_utf16
 {
-  UChar * utf16;
-  int32_t utf16_len;
-  int32_t utf16_cap;
+    UChar * utf16;
+    int32_t utf16_len;
+    int32_t utf16_cap;
 };
 
 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
 };
 
 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
+
+struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16);
+
 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
                                             size_t capacity);
 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
                                             size_t capacity);
+
 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
                                           struct icu_buf_utf16 * src16);
 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
                                           struct icu_buf_utf16 * src16);
+
 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
 
 
 
 struct icu_buf_utf8
 {
 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
 
 
 
 struct icu_buf_utf8
 {
-  uint8_t * utf8;
-  int32_t utf8_len;
-  int32_t utf8_cap;
+    uint8_t * utf8;
+    int32_t utf8_len;
+    int32_t utf8_cap;
 };
 
 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
 };
 
 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
+
+struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8);
+
 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
                                           size_t capacity);
 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
                                           size_t capacity);
+
 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
 
 
 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
 
 
@@ -89,46 +103,46 @@ UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 
 struct icu_casemap
 {
 
 struct icu_casemap
 {
-  char locale[16];
-  char action;
+    char action;
 };
 
 };
 
-struct icu_casemap * icu_casemap_create(const char *locale, char action,
-                                            UErrorCode *status);
+struct icu_casemap * icu_casemap_create(char action, UErrorCode *status);
 
 void icu_casemap_destroy(struct icu_casemap * casemap);
 
 int icu_casemap_casemap(struct icu_casemap * casemap,
                         struct icu_buf_utf16 * dest16,
                         struct icu_buf_utf16 * src16,
 
 void icu_casemap_destroy(struct icu_casemap * casemap);
 
 int icu_casemap_casemap(struct icu_casemap * casemap,
                         struct icu_buf_utf16 * dest16,
                         struct icu_buf_utf16 * src16,
-                        UErrorCode *status);
+                        UErrorCode *status,
+                        const char *locale);
 
 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
                       struct icu_buf_utf16 * src16,
                       const char *locale, char action,
                       UErrorCode *status);
 
 
 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
                       struct icu_buf_utf16 * src16,
                       const char *locale, char action,
                       UErrorCode *status);
 
-UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
-                                   struct icu_buf_utf8 * dest8, 
-                                   struct icu_buf_utf16 * src16,
-                                   UErrorCode * status);
+void icu_sortkey8_from_utf16(UCollator *coll,
+                             struct icu_buf_utf8 * dest8, 
+                             struct icu_buf_utf16 * src16,
+                             UErrorCode * status);
 
 struct icu_tokenizer
 {
 
 struct icu_tokenizer
 {
-  char locale[16];
-  char action;
-  UBreakIterator* bi;
-  struct icu_buf_utf16 * buf16;
-  int32_t token_count;
-  int32_t token_id;
-  int32_t token_start;
-  int32_t token_end;
-  // keep always invariant
-  // 0 <= token_start 
-  //   <= token_end 
-  //   <= buf16->utf16_len
-  // and invariant
-  // 0 <= token_id <= token_count
+    char action;
+    UBreakIterator* bi;
+    struct icu_buf_utf16 * buf16;
+    int32_t token_count;
+    int32_t token_id;
+    int32_t token_start;
+    int32_t token_end;
+/*
+  Invariants that must hold at all times:
+    0 <= token_start <= token_end <= buf16->utf16_len
+    0 <= token_id <= token_count
+*/
 };
 
 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
 };
 
 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
@@ -151,69 +165,55 @@ int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
 
 
 
 
 
 
-struct icu_normalizer
+struct icu_transform
 {
 {
-  char action;
-  struct icu_buf_utf16 * rules16;
-  UParseError parse_error[256];
-  UTransliterator * trans;
+    char action;
+    UParseError parse_error;
+    UTransliterator * trans;
 };
 
 };
 
-struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
-                                              UErrorCode *status);
-
-
-void icu_normalizer_destroy(struct icu_normalizer * normalizer);
-
-int icu_normalizer_normalize(struct icu_normalizer * normalizer,
-                             struct icu_buf_utf16 * dest16,
-                             struct icu_buf_utf16 * src16,
-                             UErrorCode *status);
-
+struct icu_transform * icu_transform_create(const char *id, char action,
+                                            const char *rules,
+                                            UErrorCode *status);
 
 
-#if 0
-struct icu_token
-{
-  int32_t token_id;
-  uint8_t * display8;
-  uint8_t * norm8;
-  uint8_t * sort8;
-}
-#endif
+void icu_transform_destroy(struct icu_transform * transform);
 
 
+int icu_transform_trans(struct icu_transform * transform,
+                        struct icu_buf_utf16 * dest16,
+                        struct icu_buf_utf16 * src16,
+                        UErrorCode *status);
 
 enum icu_chain_step_type {
 
 enum icu_chain_step_type {
-    ICU_chain_step_type_none,      // 
-    ICU_chain_step_type_display,   // convert to utf8 display format 
-    ICU_chain_step_type_index,     // convert to utf8 index format 
-    ICU_chain_step_type_sortkey,   // convert to utf8 sortkey format 
-    ICU_chain_step_type_casemap,   // apply utf16 charmap
-    ICU_chain_step_type_normalize, // apply utf16 normalization
-    ICU_chain_step_type_tokenize   // apply utf16 tokenization 
+    ICU_chain_step_type_none,
+    ICU_chain_step_type_display,   /* convert to utf8 display format */
+    ICU_chain_step_type_casemap,   /* apply utf16 case mapping */
+    ICU_chain_step_type_transform, /* apply utf16 transform */
+    ICU_chain_step_type_tokenize,  /* apply utf16 tokenization */
+    ICU_chain_step_type_transliterate  /* apply utf16 transliteration */
 };
 
 
 
 struct icu_chain_step
 {
 };
 
 
 
 struct icu_chain_step
 {
-  // type and action object
-  enum icu_chain_step_type type;
-  union {
-    struct icu_casemap * casemap;
-    struct icu_normalizer * normalizer;
-    struct icu_tokenizer * tokenizer;  
-  } u;
-  // temprary post-action utf16 buffer
-  struct icu_buf_utf16 * buf16;  
-  struct icu_chain_step * previous;
-  int more_tokens;
-  int need_new_token;
+    /* type and action object */
+    enum icu_chain_step_type type;
+    union {
+       struct icu_casemap * casemap;
+       struct icu_transform * transform;
+       struct icu_tokenizer * tokenizer;  
+    } u;
+    /* temporary post-action utf16 buffer */
+    struct icu_buf_utf16 * buf16;  
+    struct icu_chain_step * previous;
+    int more_tokens;
+    int need_new_token;
 };
 
 
 struct icu_chain;
 
 };
 
 
 struct icu_chain;
 
-struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
+struct icu_chain_step * icu_chain_step_create(yaz_icu_chain_t chain,
                                               enum icu_chain_step_type type,
                                               const uint8_t * rule,
                                               struct icu_buf_utf16 * buf16,
                                               enum icu_chain_step_type type,
                                               const uint8_t * rule,
                                               struct icu_buf_utf16 * buf16,
@@ -225,60 +225,54 @@ void icu_chain_step_destroy(struct icu_chain_step * step);
 
 struct icu_chain
 {
 
 struct icu_chain
 {
-  uint8_t identifier[128];
-  uint8_t locale[16];
-
-  // number of tokens returned so far
-  int32_t token_count;
-
-  // utf8 output buffers
-  struct icu_buf_utf8 * display8;
-  struct icu_buf_utf8 * norm8;
-  struct icu_buf_utf8 * sort8;
-
-  // utf16 source buffer
-  struct icu_buf_utf16 * src16;
-
-  // linked list of chain steps
-  struct icu_chain_step * steps;
+    char *locale;
+    int sort;
+
+    const char * src8cstr;
+
+    UCollator * coll;
+    
+    /* number of tokens returned so far */
+    int32_t token_count;
+    
+    /* utf8 output buffers */
+    struct icu_buf_utf8 * display8;
+    struct icu_buf_utf8 * norm8;
+    struct icu_buf_utf8 * sort8;
+    
+    /* utf16 source buffer */
+    struct icu_buf_utf16 * src16;
+    
+    /* linked list of chain steps */
+    struct icu_chain_step * steps;
 };
 
 };
 
-struct icu_chain * icu_chain_create(const uint8_t * identifier, 
-                                    const uint8_t * locale);
-
-void icu_chain_destroy(struct icu_chain * chain);
-
-struct icu_chain * icu_chain_xml_config(xmlNode *xml_node, 
-                                        UErrorCode * status);
-
-
-struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
+struct icu_chain_step * icu_chain_insert_step(yaz_icu_chain_t chain,
                                               enum icu_chain_step_type type,
                                               const uint8_t * rule,
                                               UErrorCode *status);
 
                                               enum icu_chain_step_type type,
                                               const uint8_t * rule,
                                               UErrorCode *status);
 
-
-int icu_chain_step_next_token(struct icu_chain * chain,
+int icu_chain_step_next_token(yaz_icu_chain_t chain,
                               struct icu_chain_step * step,
                               UErrorCode *status);
 
                               struct icu_chain_step * step,
                               UErrorCode *status);
 
-int icu_chain_assign_cstr(struct icu_chain * chain,
-                          const char * src8cstr, 
-                          UErrorCode *status);
-
-int icu_chain_next_token(struct icu_chain * chain,
-                         UErrorCode *status);
-
-int icu_chain_get_token_count(struct icu_chain * chain);
-
-const char * icu_chain_get_display(struct icu_chain * chain);
-
-const char * icu_chain_get_norm(struct icu_chain * chain);
+int icu_chain_token_number(yaz_icu_chain_t chain);
 
 
-const char * icu_chain_get_sort(struct icu_chain * chain);
+const UCollator * icu_chain_get_coll(yaz_icu_chain_t chain);
 
 
+yaz_icu_chain_t icu_chain_create(const char * locale,
+                                 int sort,
+                                 UErrorCode * status);
 
 
 
 
+#endif /* ICU_I18NL_H */
 
 
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
 
 
-#endif // ICU_I18NL_H