ICU tokenizer works now

author Marc Cromme <marc@indexdata.dk>

Wed, 9 May 2007 14:01:21 +0000 (14:01 +0000)

committer Marc Cromme <marc@indexdata.dk>

Wed, 9 May 2007 14:01:21 +0000 (14:01 +0000)
author Marc Cromme <marc@indexdata.dk>
Wed, 9 May 2007 14:01:21 +0000 (14:01 +0000)
committer Marc Cromme <marc@indexdata.dk>
Wed, 9 May 2007 14:01:21 +0000 (14:01 +0000)
diff --git a/src/icu_I18N.c b/src/icu_I18N.c

index b7ba91d..fa9bd82 100644 (file)
--- a/src/icu_I18N.c
+++ b/src/icu_I18N.c
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.c,v 1.7 2007-05-07 12:52:04 marc Exp $
+/* $Id: icu_I18N.c,v 1.8 2007-05-09 14:01:21 marc Exp $
     Copyright (c) 2006-2007, Index Data.
  
     This file is part of Pazpar2.
@@ -55,11 +55,13 @@
  
  int icu_check_status (UErrorCode status)
  {
-    //if(U_FAILURE(status))
-    if(!U_SUCCESS(status))
+    if(U_FAILURE(status)){
          yaz_log(YLOG_WARN, 
                  "ICU: %d %s\n", status, u_errorName(status));
-    return status;
+        return 0;   
+    }
+    return 1;
+    
  }
  
  
@@ -151,7 +153,8 @@ struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
                  buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
              else
                  buf8->utf8 
-                    = (uint8_t *) realloc(buf8->utf8, sizeof(uint8_t) * capacity);
+                    = (uint8_t *) realloc(buf8->utf8, 
+                                          sizeof(uint8_t) * capacity);
              buf8->utf8[0] = (uint8_t) 0;
              buf8->utf8_len = 0;
              buf8->utf8_cap = capacity;
@@ -405,6 +408,175 @@ UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
  
  
  
+/* Create a break-iterator tokenizer for `locale`; `action` selects the
+ * boundary kind: 'l' line, 's' sentence, 'w' word, 'c' character, 't' title.
+ * Returns 0 and sets *status on a bad locale, an unsupported action, an
+ * allocation failure or an ICU error.  Free with icu_tokenizer_destroy(). */
+struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
+                                            UErrorCode *status)
+{
+    struct icu_tokenizer * tokenizer = 0;
+    UBreakIteratorType type;
+    // resolve the action before allocating: an unknown action used to
+    // return without freeing the tokenizer
+    switch(action) {
+    case 'l':
+        type = UBRK_LINE;
+        break;
+    case 's':
+        type = UBRK_SENTENCE;
+        break;
+    case 'w':
+        type = UBRK_WORD;
+        break;
+    case 'c':
+        type = UBRK_CHARACTER;
+        break;
+    case 't':
+        type = UBRK_TITLE;
+        break;
+    default:
+        *status = U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    // the locale must fit into tokenizer->locale including its NUL
+    if (!locale || strlen(locale) >= sizeof(tokenizer->locale)) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    tokenizer = (struct icu_tokenizer *) malloc(sizeof(*tokenizer));
+    if (!tokenizer) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return 0;
+    }
+
+    strcpy(tokenizer->locale, locale);
+    tokenizer->action = action;
+    tokenizer->buf16 = 0;
+    tokenizer->token_count = 0;
+    tokenizer->token_id = 0;
+    tokenizer->token_start = 0;
+    tokenizer->token_end = 0;
+    tokenizer->bi = ubrk_open(type, tokenizer->locale, 0, 0, status);
+
+    // U_SUCCESS also lets ICU warnings through; only real failures free
+    if (U_SUCCESS(*status))
+        return tokenizer;
+
+    free(tokenizer);
+    return 0;
+}
+
+// Closes the break iterator (if any) and frees the tokenizer; 0 is a no-op.
+void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
+{
+    if (!tokenizer)
+        return;
+    if (tokenizer->bi)
+        ubrk_close(tokenizer->bi);
+    free(tokenizer);
+}
+
+/* Attach the text in src16 to the tokenizer and rewind the token cursor.
+ * Only the src16 pointer is stored, so src16 must stay valid while in use.
+ * Returns 1 on success, 0 on missing arguments or ICU failure (*status). */
+int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
+                         struct icu_buf_utf16 * src16, 
+                         UErrorCode *status)
+{
+    if (!tokenizer || !tokenizer->bi || !src16)
+        return 0;
+    // new text: restart token bookkeeping so next_token begins at offset 0
+    tokenizer->buf16 = src16;
+    tokenizer->token_id = 0;
+    tokenizer->token_start = 0;
+    tokenizer->token_end = 0;
+    ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
+    return U_FAILURE(*status) ? 0 : 1;
+}
+
+/* Advance to the next token of the attached text.
+ * Updates token_id, token_start and token_end and, when tkn16 is given,
+ * copies the token's UTF-16 code units into it (growing it if needed).
+ * Returns the token length in UTF-16 code units; 0 when the tokenizer,
+ * its iterator or its text is missing or empty, when *status already
+ * holds a failure, or when the computed token is empty. */
+int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
+                         struct icu_buf_utf16 * tkn16, 
+                         UErrorCode *status)
+{
+    int32_t from;
+    int32_t to;
+    int32_t len;
+
+    if (!tokenizer || !tokenizer->bi)
+        return 0;
+    if (!tokenizer->buf16 || !tokenizer->buf16->utf16_len)
+        return 0;
+
+    // tokenizer->buf16 is never modified here; intended invariant:
+    // 0 <= token_start <= token_end <= buf16->utf16_len
+
+    // token_end == 0 marks the first call; otherwise resume at the last end
+    from = tokenizer->token_end ? tokenizer->token_end
+                                : ubrk_first(tokenizer->bi);
+
+    // ubrk_next yields UBRK_DONE (-1) at end of text; clamp it to the
+    // text length to repair the invariant
+    to = ubrk_next(tokenizer->bi);
+    if (UBRK_DONE == to)
+        to = tokenizer->buf16->utf16_len;
+
+    // no state is updated while *status holds a failure
+    if (U_FAILURE(*status))
+        return 0;
+
+    len = to - from;
+    tokenizer->token_id++;
+    tokenizer->token_start = from;
+    tokenizer->token_end = to;
+
+    // copying into token buffer if it exists
+    if (tkn16) {
+        if (tkn16->utf16_cap < len)
+            icu_buf_utf16_resize(tkn16, (size_t) len * 2);
+        u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[from], len);
+        tkn16->utf16_len = len;
+    }
+
+    return len;
+}
+// Accessors: like the other tokenizer calls, each yields 0 for a null tokenizer.
+// Value of token_id, which next_token increments.
+int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
+{
+    return tokenizer ? tokenizer->token_id : 0;
+}
+// Start offset of the current token within the attached UTF-16 text.
+int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
+{
+    return tokenizer ? tokenizer->token_start : 0;
+}
+// End offset (exclusive) of the current token.
+int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
+{
+    return tokenizer ? tokenizer->token_end : 0;
+}
+// Length of the current token in UTF-16 code units (end - start).
+int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
+{
+    return tokenizer ? (tokenizer->token_end - tokenizer->token_start) : 0;
+}
+// Stored token_count; NOTE(review): next_token never updates this field.
+int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
+{
+    return tokenizer ? tokenizer->token_count : 0;
+}
+
+
+
  
  #endif // HAVE_ICU    
  
diff --git a/src/icu_I18N.h b/src/icu_I18N.h

index 803d89b..df6cd2d 100644 (file)
--- a/src/icu_I18N.h
+++ b/src/icu_I18N.h
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.h,v 1.7 2007-05-07 12:52:04 marc Exp $
+/* $Id: icu_I18N.h,v 1.8 2007-05-09 14:01:21 marc Exp $
     Copyright (c) 2006-2007, Index Data.
  
     This file is part of Pazpar2.
@@ -35,10 +35,19 @@
  //#include <unicode/ucnv.h>     /* C   Converter API    */
  //#include <unicode/ustring.h>  /* some more string fcns*/
  //#include <unicode/uloc.h>
-//#include <unicode/ubrk.h>
+#include <unicode/ubrk.h>
  //#include <unicode/unistr.h>
  
  
+// forward declarations
+//struct UBreakIterator;
+
+
+
+
+// declared structs and functions
+
+
  int icu_check_status (UErrorCode status);
  
  struct icu_buf_utf16
@@ -91,6 +100,44 @@ UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
                                     struct icu_buf_utf16 * src16,
                                     UErrorCode * status);
  
+struct icu_tokenizer
+{
+  char locale[16];              // locale name passed to ubrk_open
+  char action;                  // boundary kind: 'l', 's', 'w', 'c' or 't'
+  UBreakIterator* bi;           // break iterator; closed by destroy
+  struct icu_buf_utf16 * buf16; // text set by attach; not freed by destroy
+  int32_t token_count;          // NOTE(review): next_token never updates this
+  int32_t token_id;             // incremented by next_token
+  int32_t token_start;          // start offset of the current token in buf16
+  int32_t token_end;            // end offset of the current token in buf16
+  // invariant that must always hold:
+  // 0 <= token_start 
+  //   <= token_end 
+  //   <= buf16->utf16_len
+  // second invariant:
+  // 0 <= token_id <= token_count
+};
+// Creates a tokenizer; action is 'l', 's', 'w', 'c' or 't'. Returns 0 on error.
+struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
+                                            UErrorCode *status);
+// Closes the break iterator and frees the tokenizer; accepts a null pointer.
+void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
+// Attaches src16 as the text to tokenize; returns 1 on success, 0 otherwise.
+int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
+                         struct icu_buf_utf16 * src16, UErrorCode *status);
+// Advances to the next token, copying it to tkn16 if given; returns its length.
+int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
+                                 struct icu_buf_utf16 * tkn16, 
+                                 UErrorCode *status);
+// Accessors for the fields of the same name; token_length is end - start.
+int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
+int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
+
+
+
  
  #endif // HAVE_ICU
  #endif // ICU_I18NL_H
diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c

index c9d3e39..992922a 100644 (file)
--- a/src/test_icu_I18N.c
+++ b/src/test_icu_I18N.c
@@ -1,4 +1,4 @@
-/* $Id: test_icu_I18N.c,v 1.10 2007-05-07 12:52:04 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.11 2007-05-09 14:01:21 marc Exp $
     Copyright (c) 2006-2007, Index Data.
  
     This file is part of Pazpar2.
@@ -209,7 +209,7 @@ int test_icu_sortmap(const char * locale, int src_list_len,
      UCollator *coll = ucol_open(locale, &status); 
      icu_check_status(status);
  
-    if(!U_SUCCESS(status))
+    if(U_FAILURE(status))
          return 0;
  
      // assigning display terms and sort keys using buf 8 and buf16
@@ -312,6 +312,75 @@ void test_icu_I18N_sortmap(int argc, char **argv)
  }
  
  
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+void test_icu_I18N_normmap(int argc, char **argv)
+{
+    // placeholder: no checks are implemented in this function
+
+}
+
+
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+void test_icu_I18N_tokenizer(int argc, char **argv)
+{
+    // Splits a fixed English sentence with the 's' (sentence) tokenizer
+    // and prints each token converted back to UTF-8.
+
+    const char * src8cstr 
+        = "Though I am not naturally honest, I am so sometimes by chance.";
+
+    UErrorCode status = U_ZERO_ERROR;
+    struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
+    struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
+    struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
+    struct icu_tokenizer * tokenizer = 0;
+
+    printf("Input:  '%s'\n", src8cstr);
+
+    // transforming to UTF16
+    icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
+    icu_check_status(status);
+
+    // set up tokenizer
+    tokenizer = icu_tokenizer_create("en", 's', &status);
+    icu_check_status(status);
+    YAZ_CHECK(tokenizer);
+
+    // attach text buffer to tokenizer; checking the return value instead
+    // of tokenizer->bi avoids dereferencing a tokenizer that failed to
+    // be created (attach returns 0 for a null tokenizer or iterator)
+    YAZ_CHECK(icu_tokenizer_attach(tokenizer, src16, &status));
+    icu_check_status(status);
+
+    // perform work on tokens; next_token returns 0 for a null tokenizer
+    printf("Tokens: ");
+    while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
+        icu_check_status(status);
+
+        // converting to UTF8
+        icu_utf16_to_utf8(tkn8, tkn16, &status);
+        icu_check_status(status);
+
+        printf("'%s' ", tkn8->utf8);
+    }
+
+    // nothing to report without a tokenizer
+    if (tokenizer)
+        printf(" (%d)(%d)\n", icu_tokenizer_token_id(tokenizer),
+               icu_tokenizer_token_count(tokenizer));
+
+    icu_tokenizer_destroy(tokenizer);
+    icu_buf_utf16_destroy(src16);
+    icu_buf_utf16_destroy(tkn16);
+    icu_buf_utf8_destroy(tkn8);
+}
+
+
+
+
+
  #endif // HAVE_ICU
  
  // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
@@ -327,7 +396,9 @@ int main(int argc, char **argv)
      //test_icu_I18N_casemap_failures(argc, argv);
      test_icu_I18N_casemap(argc, argv);
      test_icu_I18N_sortmap(argc, argv);
- 
+    test_icu_I18N_normmap(argc, argv);
+    test_icu_I18N_tokenizer(argc, argv);
+
  #else // HAVE_ICU
  
      printf("ICU unit tests omitted.\n"
author	Marc Cromme <marc@indexdata.dk>
	Wed, 9 May 2007 14:01:21 +0000 (14:01 +0000)
committer	Marc Cromme <marc@indexdata.dk>
	Wed, 9 May 2007 14:01:21 +0000 (14:01 +0000)
src/icu_I18N.c		patch | blob | history
src/icu_I18N.h		patch | blob | history
src/test_icu_I18N.c		patch | blob | history