ICU chain of normalizers and tokenizers half-way implemented
author Marc Cromme <marc@indexdata.dk>
Mon, 14 May 2007 13:51:24 +0000 (13:51 +0000)
committer Marc Cromme <marc@indexdata.dk>
Mon, 14 May 2007 13:51:24 +0000 (13:51 +0000)
src/icu_I18N.c
src/icu_I18N.h

index 0508719..39c8716 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.c,v 1.11 2007-05-11 10:38:42 marc Exp $
+/* $Id: icu_I18N.c,v 1.12 2007-05-14 13:51:24 marc Exp $
    Copyright (c) 2006-2007, Index Data.
 
    This file is part of Pazpar2.
@@ -712,6 +712,77 @@ int icu_normalizer_normalize(struct icu_normalizer * normalizer,
 
 
 
+/**
+ * Allocate a new, empty ICU processing chain.
+ *
+ * The identifier and locale strings are copied into the chain's fixed-size
+ * fields; over-long input is silently truncated and both copies are always
+ * NUL-terminated.  A null identifier or locale is stored as the empty
+ * string.  The three utf8 output buffers and the utf16 source buffer are
+ * created with a size argument of 0, the token count is reset and the
+ * list of steps starts out empty.
+ *
+ * Returns the new chain, or 0 if the chain struct could not be allocated.
+ * Release the chain with icu_chain_destroy().
+ */
+struct icu_chain * icu_chain_create(const uint8_t * identifier, 
+                                    const uint8_t * locale)
+{
+    struct icu_chain * chain = malloc(sizeof *chain);
+
+    if (!chain)
+        return 0;
+
+    // sizeof keeps the copy bounds in sync with the field declarations;
+    // strncpy does not terminate on truncation, so terminate explicitly
+    strncpy((char *) chain->identifier,
+            identifier ? (const char *) identifier : "",
+            sizeof(chain->identifier));
+    chain->identifier[sizeof(chain->identifier) - 1] = '\0';
+
+    strncpy((char *) chain->locale,
+            locale ? (const char *) locale : "",
+            sizeof(chain->locale));
+    chain->locale[sizeof(chain->locale) - 1] = '\0';
+
+    chain->token_count = 0;
+
+    chain->display8 = icu_buf_utf8_create(0);
+    chain->norm8 = icu_buf_utf8_create(0);
+    chain->sort8 = icu_buf_utf8_create(0);
+
+    chain->src16 = icu_buf_utf16_create(0);
+
+    chain->steps = 0;
+
+    return chain;
+}
+
+/**
+ * Destroy a chain obtained from icu_chain_create().
+ *
+ * Releases the three utf8 output buffers, the utf16 source buffer and the
+ * whole list of steps, then frees the chain struct itself.  A null chain
+ * is ignored.  The pointer must not be used after this call.
+ */
+void icu_chain_destroy(struct icu_chain * chain)
+{
+    if (!chain)
+        return;
+
+    icu_buf_utf8_destroy(chain->display8);
+    icu_buf_utf8_destroy(chain->norm8);
+    icu_buf_utf8_destroy(chain->sort8);
+
+    icu_buf_utf16_destroy(chain->src16);
+
+    icu_chain_step_destroy(chain->steps);
+
+    // the struct was malloc'ed by icu_chain_create(); without this the
+    // chain object itself leaks
+    free(chain);
+}
+
+/**
+ * Allocate a chain step of the given type.
+ *
+ * Every field of the new step is initialized so that none is left
+ * indeterminate: the type is recorded, and the action object, the utf16
+ * buffer and the next pointer are all null.
+ *
+ * NOTE(review): this is still a stub.  It does not create the
+ * normalizer/tokenizer described by 'rule' and does not link the step
+ * into 'chain'; both parameters are currently unused.
+ *
+ * Returns the new step, or 0 if allocation failed.
+ */
+struct icu_chain_step * icu_chain_append_step(struct icu_chain * chain,
+                                         enum icu_chain_step_type type,
+                                         const uint8_t * rule)
+{
+    struct icu_chain_step * step = malloc(sizeof *step);
+
+    // not used until the step construction is implemented
+    (void) chain;
+    (void) rule;
+
+    if (!step)
+        return 0;
+
+    step->type = type;
+    // both union members are struct pointers sharing the same storage,
+    // so clearing one clears the other
+    step->u.normalizer = NULL;
+    step->buf16 = NULL;
+    step->next = NULL;
+
+    return step;
+}
+
+/**
+ * Destroy a step and every step linked after it through 'next'.
+ *
+ * The list is torn down from the tail: the following steps are destroyed
+ * first, then this step's action object (normalizer or tokenizer,
+ * depending on its type), and finally the step struct itself is freed.
+ * A null step is ignored.
+ *
+ * NOTE(review): step->buf16 is not released here; who owns that buffer
+ * is not visible in this part of the file - confirm.
+ */
+void icu_chain_step_destroy(struct icu_chain_step * step)
+{
+    if (!step)
+        return;
+
+    // a null 'next' is handled by the guard above
+    icu_chain_step_destroy(step->next);
+
+    // destroy the action object of the last living icu_chain_step
+    switch(step->type) {
+    case ICU_chain_step_type_normalize:
+        icu_normalizer_destroy(step->u.normalizer);
+        break;
+    case ICU_chain_step_type_tokenize:
+        icu_tokenizer_destroy(step->u.tokenizer);
+        break;
+    default:
+        break;
+    }
+
+    // release the list node itself; previously every node leaked
+    free(step);
+}
+
+
 
 #endif // HAVE_ICU    
 
index 74adfc4..2746f07 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.h,v 1.11 2007-05-11 10:38:42 marc Exp $
+/* $Id: icu_I18N.h,v 1.12 2007-05-14 13:51:24 marc Exp $
    Copyright (c) 2006-2007, Index Data.
 
    This file is part of Pazpar2.
 
 
 
-// forward declarations
-//struct UBreakIterator;
-
-
-
-
 // declared structs and functions
 
-
 int icu_check_status (UErrorCode status);
 
 struct icu_buf_utf16
@@ -162,6 +155,73 @@ int icu_normalizer_normalize(struct icu_normalizer * normalizer,
                              UErrorCode *status);
 
 
+#if 0
+// Disabled draft of a token record; excluded from compilation by '#if 0'.
+// It holds a token id and three utf8 byte pointers named like the
+// display8/norm8/sort8 buffers of struct icu_chain.
+struct icu_token
+{
+  int32_t token_id;
+  uint8_t * display8;
+  uint8_t * norm8;
+  uint8_t * sort8;
+};
+#endif
+
+// Kind of action a chain step performs; stored in icu_chain_step.type.
+// Only the normalize and tokenize kinds have an action object in the
+// icu_chain_step union.
+enum icu_chain_step_type {
+    ICU_chain_step_type_none,      // first enumerator (value 0); presumably "no action" - confirm
+    ICU_chain_step_type_display,   // convert to utf8 display format 
+    ICU_chain_step_type_norm,      // convert to utf8 norm format 
+    ICU_chain_step_type_sort,      // convert to utf8 sort format 
+    ICU_chain_step_type_charmap,   // apply utf16 charmap
+    ICU_chain_step_type_normalize, // apply utf16 normalization
+    ICU_chain_step_type_tokenize   // apply utf16 tokenization 
+};
+
+
+
+// One node in a chain's singly linked list of processing steps.
+struct icu_chain_step
+{
+  // type and action object
+  enum icu_chain_step_type type;
+  // action object; which member is valid depends on 'type'
+  // (normalizer for ..._normalize, tokenizer for ..._tokenize)
+  union {
+    struct icu_normalizer * normalizer;
+    struct icu_tokenizer * tokenizer;  
+  } u;
+  // temporary post-action utf16 buffer
+  struct icu_buf_utf16 * buf16;  
+  // following step in the list, or null at the end of the list
+  struct icu_chain_step * next;
+};
+
+
+// A named, locale-specific chain of processing steps together with the
+// buffers it works on.  Created by icu_chain_create() and released by
+// icu_chain_destroy().
+struct icu_chain
+{
+  // NUL-terminated name of the chain; longer input is truncated on creation
+  uint8_t identifier[128];
+  // NUL-terminated locale string; longer input is truncated on creation
+  uint8_t locale[16];
+
+  // number of tokens returned so far
+  int32_t token_count;
+
+  // utf8 output buffers
+  struct icu_buf_utf8 * display8;
+  struct icu_buf_utf8 * norm8;
+  struct icu_buf_utf8 * sort8;
+
+  // utf16 source buffer
+  struct icu_buf_utf16 * src16;
+
+  // linked list of chain steps
+  struct icu_chain_step * steps;
+};
+
+// Allocate a chain with empty buffers and no steps.  'identifier' and
+// 'locale' are copied into the chain's fixed-size fields (truncated if
+// too long, always NUL-terminated).  Release with icu_chain_destroy().
+struct icu_chain * icu_chain_create(const uint8_t * identifier, 
+                                    const uint8_t * locale);
+
+// Destroy the chain's utf8/utf16 buffers and its list of steps.
+void icu_chain_destroy(struct icu_chain * chain);
+
+// Allocate a new chain step and return it.
+// NOTE(review): the implementation is not finished yet - 'chain' and
+// 'rule' are not used by it so far; confirm before relying on them.
+struct icu_chain_step * icu_chain_append_step(struct icu_chain * chain,
+                                              enum icu_chain_step_type type,
+                                              const uint8_t * rule);
+
+// Destroy 'step' and all steps linked after it via 'next', including the
+// normalizer or tokenizer each one holds.  A null step is a no-op.
+void icu_chain_step_destroy(struct icu_chain_step * step);
+
 
 
 #endif // HAVE_ICU