Merge branch 'master' into stemming

[yaz-moved-to-github.git] / include / yaz / icu_I18N.h
diff --git a/include/yaz/icu_I18N.h b/include/yaz/icu_I18N.h

index c0af4cf..eb0f22d 100644 (file)
--- a/include/yaz/icu_I18N.h
+++ b/include/yaz/icu_I18N.h
@@ -1,5 +1,5 @@
-/*
- * Copyright (c) 1995-2007, Index Data
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2010 Index Data.
   * All rights reserved.
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
@@ -27,6 +27,9 @@
  
  /** \file
      \brief Internal header for ICU utilities
+
+    These functions, while non-static, are considered unstable and internal
+    and may be renamed for each YAZ release.
  */
  
  #ifndef ICU_I18NL_H
@@ -34,14 +37,11 @@
  
  #include <yaz/yconfig.h>
  
-#include <libxml/tree.h>
-
  #include <unicode/utypes.h>   /* Basic ICU data types */
  #include <unicode/uchar.h>    /* char names           */
  
-#include <unicode/ucol.h> 
+#include <unicode/ucol.h>
  #include <unicode/ubrk.h>
-#include <unicode/utrans.h>
  
  #include <yaz/icu.h>
  
@@ -63,12 +63,12 @@ struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16);
  struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
                                              size_t capacity);
  
-struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
-                                          struct icu_buf_utf16 * src16);
+struct icu_buf_utf16 *icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
+                                         const struct icu_buf_utf16 * src16);
  
  void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
  
-
+struct icu_buf_utf8;
  
  struct icu_buf_utf8
  {
@@ -87,26 +87,23 @@ struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
  void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
  
  
-UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
-                               struct icu_buf_utf8 * src8,
-                               UErrorCode * status);
-
  UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
                                      const char * src8cstr,
                                      UErrorCode * status);
  
+const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8);
  
-UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
-                             struct icu_buf_utf16 * src16,
+
+UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 *dest8,
+                             const struct icu_buf_utf16 *src16,
                               UErrorCode * status);
  
-struct icu_casemap
-{
-    char action;
-};
+struct icu_casemap;
  
  struct icu_casemap * icu_casemap_create(char action, UErrorCode *status);
  
+struct icu_casemap *icu_casemap_clone(struct icu_casemap *old);
+
  void icu_casemap_destroy(struct icu_casemap * casemap);
  
  int icu_casemap_casemap(struct icu_casemap * casemap,
@@ -125,28 +122,11 @@ void icu_sortkey8_from_utf16(UCollator *coll,
                               struct icu_buf_utf16 * src16,
                               UErrorCode * status);
  
-struct icu_tokenizer
-{
-    char action;
-    UBreakIterator* bi;
-    struct icu_buf_utf16 * buf16;
-    int32_t token_count;
-    int32_t token_id;
-    int32_t token_start;
-    int32_t token_end;
-/*
-  keep always invariant
-  0 <= token_start 
-  <= token_end 
-  <= buf16->utf16_len
-  and invariant
-  0 <= token_id <= token_count
-*/
-};
-
+struct icu_tokenizer;
  struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
                                              UErrorCode *status);
  
+struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old);
  void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
  
  int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
@@ -156,115 +136,37 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
                                   struct icu_buf_utf16 * tkn16, 
                                   UErrorCode *status);
  
-int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
-int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
-int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
-int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
  int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
  
+struct icu_transform;
  
+struct icu_transform * icu_transform_create(const char *id, char action,
+                                            const char *rules,
+                                            UErrorCode *status);
+struct icu_transform *icu_transform_clone(struct icu_transform *old);
+void icu_transform_destroy(struct icu_transform * transform);
  
-struct icu_normalizer
-{
-    char action;
-    struct icu_buf_utf16 * rules16;
-    UParseError parse_error[256];
-    UTransliterator * trans;
-};
-
-struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
-                                              UErrorCode *status);
-
-
-void icu_normalizer_destroy(struct icu_normalizer * normalizer);
-
-int icu_normalizer_normalize(struct icu_normalizer * normalizer,
-                             struct icu_buf_utf16 * dest16,
-                             struct icu_buf_utf16 * src16,
-                             UErrorCode *status);
-
-enum icu_chain_step_type {
-    ICU_chain_step_type_none,
-    ICU_chain_step_type_display,   /* convert to utf8 display format */
-    ICU_chain_step_type_casemap,   /* apply utf16 charmap */
-    ICU_chain_step_type_normalize, /* apply utf16 normalization */
-    ICU_chain_step_type_tokenize   /* apply utf16 tokenization */
-};
-
-
-
-struct icu_chain_step
-{
-    /* type and action object */
-    enum icu_chain_step_type type;
-    union {
-       struct icu_casemap * casemap;
-       struct icu_normalizer * normalizer;
-       struct icu_tokenizer * tokenizer;  
-    } u;
-    /* temprary post-action utf16 buffer */
-    struct icu_buf_utf16 * buf16;  
-    struct icu_chain_step * previous;
-    int more_tokens;
-    int need_new_token;
-};
-
-
-struct icu_chain;
-
-struct icu_chain_step * icu_chain_step_create(yaz_icu_chain_t chain,
-                                              enum icu_chain_step_type type,
-                                              const uint8_t * rule,
-                                              struct icu_buf_utf16 * buf16,
-                                              UErrorCode *status);
-
-
-void icu_chain_step_destroy(struct icu_chain_step * step);
-
-
-struct icu_chain
-{
-    char *locale;
-    int sort;
-
-    const char * src8cstr;
-
-    UCollator * coll;
-    
-    /* number of tokens returned so far */
-    int32_t token_count;
-    
-    /* utf8 output buffers */
-    struct icu_buf_utf8 * display8;
-    struct icu_buf_utf8 * norm8;
-    struct icu_buf_utf8 * sort8;
-    
-    /* utf16 source buffer */
-    struct icu_buf_utf16 * src16;
-    
-    /* linked list of chain steps */
-    struct icu_chain_step * steps;
-};
-
-struct icu_chain_step * icu_chain_insert_step(yaz_icu_chain_t chain,
-                                              enum icu_chain_step_type type,
-                                              const uint8_t * rule,
-                                              UErrorCode *status);
+int icu_transform_trans(struct icu_transform * transform,
+                        struct icu_buf_utf16 * dest16,
+                        const struct icu_buf_utf16 * src16,
+                        UErrorCode *status);
  
-int icu_chain_step_next_token(yaz_icu_chain_t chain,
-                              struct icu_chain_step * step,
-                              UErrorCode *status);
+struct icu_chain_step;
  
  int icu_chain_token_number(yaz_icu_chain_t chain);
  
-const UCollator * icu_chain_get_coll(yaz_icu_chain_t chain);
+yaz_icu_chain_t icu_chain_create(const char * locale,
+                                 int sort,
+                                 UErrorCode * status);
  
  #endif /* ICU_I18NL_H */
  
  /*
   * Local variables:
   * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
   * indent-tabs-mode: nil
   * End:
   * vim: shiftwidth=4 tabstop=8 expandtab
   */
+