Refactor ssl_put and tcpip_put to one function YAZ-832

[yaz-moved-to-github.git] / src / icu_tokenizer.c
diff --git a/src/icu_tokenizer.c b/src/icu_tokenizer.c

index e8d3123..bf82c5b 100644 (file)
--- a/src/icu_tokenizer.c
+++ b/src/icu_tokenizer.c
@@ -1,5 +1,5 @@
  /* This file is part of the YAZ toolkit.
- * Copyright (C) 1995-2010 Index Data
+ * Copyright (C) Index Data
   * See the file LICENSE for details.
   */
  
@@ -38,8 +38,8 @@ struct icu_tokenizer
      int32_t token_end;
  /*
    keep always invariant
-  0 <= token_start 
-  <= token_end 
+  0 <= token_start
+  <= token_end
    <= buf16->utf16_len
    and invariant
    0 <= token_id <= token_count
@@ -61,8 +61,8 @@ static void icu_tokenizer_reset(struct icu_tokenizer *tokenizer,
  
  struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old)
  {
-    uint32_t bufferSize = 10000;
-    UErrorCode status = 0;
+    int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
+    UErrorCode status = U_ZERO_ERROR;
      struct icu_tokenizer * tokenizer
          = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
  
@@ -78,12 +78,12 @@ struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old)
  struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
                                             UErrorCode *status)
  {
-    struct icu_tokenizer * tokenizer
+    struct icu_tokenizer *tokenizer
          = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
  
      icu_tokenizer_reset(tokenizer, action);
      switch (tokenizer->action)
-    {    
+    {
      case 'l':
      case 'L':
          tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
@@ -109,7 +109,7 @@ struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
          return 0;
          break;
      }
-    
+
      /* ICU error stuff is a very  funny business */
      if (U_SUCCESS(*status))
          return tokenizer;
@@ -119,7 +119,7 @@ struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
      return 0;
  }
  
-void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
+void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
  {
      if (tokenizer)
      {
@@ -130,8 +130,8 @@ void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
      }
  }
  
-int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
-                         struct icu_buf_utf16 * src16, 
+int icu_tokenizer_attach(struct icu_tokenizer *tokenizer,
+                         struct icu_buf_utf16 *src16,
                           UErrorCode *status)
  {
      if (!tokenizer || !tokenizer->bi || !src16)
@@ -146,16 +146,17 @@ int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
  
      ubrk_setText(tokenizer->bi,
                   tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
-     
+
      if (U_FAILURE(*status))
          return 0;
  
      return 1;
  }
  
-int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
-                                 struct icu_buf_utf16 * tkn16, 
-                                 UErrorCode *status)
+int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
+                                 struct icu_buf_utf16 *tkn16,
+                                 UErrorCode *status,
+                                 size_t *start, size_t *len)
  {
      int32_t tkn_start = 0;
      int32_t tkn_end = 0;
@@ -166,8 +167,8 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
          return 0;
      /*
      never change tokenizer->buf16 and keep always invariant
-    0 <= tokenizer->token_start 
-       <= tokenizer->token_end 
+    0 <= tokenizer->token_start
+       <= tokenizer->token_end
         <= tokenizer->buf16->utf16_len
      returns length of token
      */
@@ -186,8 +187,8 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
  
      /* copy out if everything is well */
      if (U_FAILURE(*status))
-        return 0;        
-    
+        return 0;
+
      /* everything OK, now update internal state */
      tkn_len = tkn_end - tkn_start;
  
@@ -195,18 +196,23 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
      {
          tokenizer->token_count++;
          tokenizer->token_id++;
-    } else {
-        tokenizer->token_id = 0;    
      }
+    else
+        tokenizer->token_id = 0;
+
      tokenizer->token_start = tkn_start;
-    tokenizer->token_end = tkn_end;    
+    tokenizer->token_end = tkn_end;
+
+    *start = tkn_start;
+    *len = tkn_end - tkn_start;
  
      /* copying into token buffer if it exists */
-    if (tkn16){
+    if (tkn16)
+    {
          if (tkn16->utf16_cap < tkn_len)
              icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
  
-        u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], 
+        u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
                    tkn_len);
  
          tkn16->utf16_len = tkn_len;
@@ -215,7 +221,7 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
      return tkn_len;
  }
  
-int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
+int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
  {
      return tokenizer->token_count;
  }