Split ICU wrapper library into several sources
diff --git a/src/icu_tokenizer.c b/src/icu_tokenizer.c
new file mode 100644
index 0000000..c7bba7f
--- /dev/null
+++ b/src/icu_tokenizer.c
@@ -0,0 +1,188 @@
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2009 Index Data
+ * See the file LICENSE for details.
+ */
+
+/**
+ * \file
+ * \brief ICU tokenization - using ubrk_-functions from ICU
+ */
+
+#if HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if YAZ_HAVE_ICU
+#include <yaz/xmalloc.h>
+
+#include <yaz/icu_I18N.h>
+
+#include <yaz/log.h>
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <unicode/ustring.h>  /* more string functions */
+#include <unicode/uchar.h>    /* character names */
+
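+/** \brief allocate a tokenizer and open an ICU break iterator of the kind
+    selected by action: 'l' line, 's' sentence, 'w' word, 'c' character,
+    't' title (case-insensitive); returns 0 and sets *status on failure */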
+struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
+                                           UErrorCode *status)
+{
+    struct icu_tokenizer * tokenizer
+        = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
+
+    tokenizer->action = action;
+    tokenizer->bi = 0;
+    tokenizer->buf16 = 0;
+    tokenizer->token_count = 0;
+    tokenizer->token_id = 0;
+    tokenizer->token_start = 0;
+    tokenizer->token_end = 0;
+
+    switch (tokenizer->action)
+    {    
+    case 'l':
+    case 'L':
+        tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
+        break;
+    case 's':
+    case 'S':
+        tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
+        break;
+    case 'w':
+    case 'W':
+        tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
+        break;
+    case 'c':
+    case 'C':
+        tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
+        break;
+    case 't':
+    case 'T':
+        tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
+        break;
+    default:
+        *status = U_UNSUPPORTED_ERROR;
+        break; /* fall through to the cleanup below instead of leaking tokenizer */
+    }
+    
+    /* ICU reports errors via *status; only return the tokenizer on success */
+    if (U_SUCCESS(*status))
+        return tokenizer;
+
+    /* open failed: release the partially constructed tokenizer */
+    icu_tokenizer_destroy(tokenizer);
+    return 0;
+}
+
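+/** \brief close the break iterator (if any) and free the tokenizer */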
+void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
+{
+    if (tokenizer)
+    {
+        if (tokenizer->bi)
+            ubrk_close(tokenizer->bi);
+        xfree(tokenizer);
+    }
+}
+
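+/** \brief attach a UTF-16 buffer to the tokenizer and reset the token
+    state; returns 1 on success, 0 if arguments are missing or
+    ubrk_setText() fails */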
+int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
+                         struct icu_buf_utf16 * src16, 
+                         UErrorCode *status)
+{
+    if (!tokenizer || !tokenizer->bi || !src16)
+        return 0;
+
+    tokenizer->buf16 = src16;
+    tokenizer->token_count = 0;
+    tokenizer->token_id = 0;
+    tokenizer->token_start = 0;
+    tokenizer->token_end = 0;
+
+    ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
+     
+    if (U_FAILURE(*status))
+        return 0;
+
+    return 1;
+}
+
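+/** \brief advance to the next boundary, copy the token into tkn16 (if
+    supplied) and return its length in UTF-16 code units; returns 0 when
+    the text is exhausted or on error */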
+int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
+                                 struct icu_buf_utf16 * tkn16, 
+                                 UErrorCode *status)
+{
+    int32_t tkn_start = 0;
+    int32_t tkn_end = 0;
+    int32_t tkn_len = 0;
+
+    if (!tokenizer || !tokenizer->bi
+        || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
+        return 0;
+    /*
+     * tokenizer->buf16 is never modified here, and the invariant
+     *     0 <= tokenizer->token_start
+     *        <= tokenizer->token_end
+     *        <= tokenizer->buf16->utf16_len
+     * always holds.  The return value is the token length in UTF-16 units.
+     */
+
+    if (0 == tokenizer->token_end) /* first call */
+        tkn_start = ubrk_first(tokenizer->bi);
+    else /* successive calls */
+        tkn_start = tokenizer->token_end;
+
+    /* get next position */
+    tkn_end = ubrk_next(tokenizer->bi);
+
+    /* at end of text ubrk_next() returns UBRK_DONE (-1); clamp to the buffer length */
+    if (UBRK_DONE == tkn_end)
+        tkn_end = tokenizer->buf16->utf16_len;
+
+    /* stop here if ICU reported an error */
+    if (U_FAILURE(*status))
+        return 0;        
+    
+    /* everything OK, now update internal state */
+    tkn_len = tkn_end - tkn_start;
+
+    if (0 < tkn_len)
+    {
+        tokenizer->token_count++;
+        tokenizer->token_id++;
+    }
+    else
+    {
+        tokenizer->token_id = 0;
+    }
+    tokenizer->token_start = tkn_start;
+    tokenizer->token_end = tkn_end;    
+
+    /* copy the token into the caller's buffer, if one was supplied */
+    if (tkn16)
+    {
+        if (tkn16->utf16_cap < tkn_len)
+            icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
+
+        u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], 
+                  tkn_len);
+
+        tkn16->utf16_len = tkn_len;
+    }
+
+    return tkn_len;
+}
+
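+/** \brief return the number of non-empty tokens returned so far */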
+int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
+{
+    return tokenizer->token_count;
+}
+
+#endif /* YAZ_HAVE_ICU */
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
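For context, here is a minimal usage sketch of the API above (not part of this commit): it opens a word tokenizer, attaches a UTF-16 buffer and walks the resulting tokens. The buffer helpers icu_buf_utf16_create(), icu_utf16_from_utf8_cstr() and icu_buf_utf16_destroy() are assumed to be the ones provided by the rest of the ICU wrapper library via yaz/icu_I18N.h.

#include <yaz/icu_I18N.h>

static void dump_word_tokens(const char *utf8_text)
{
    UErrorCode status = U_ZERO_ERROR;
    struct icu_buf_utf16 *src16 = icu_buf_utf16_create(0);
    struct icu_buf_utf16 *tkn16 = icu_buf_utf16_create(0);
    struct icu_tokenizer *tok = icu_tokenizer_create("en", 'w', &status);

    /* the tokenizer works on UTF-16, so convert the UTF-8 input first */
    icu_utf16_from_utf8_cstr(src16, utf8_text, &status);

    if (tok && U_SUCCESS(status)
        && icu_tokenizer_attach(tok, src16, &status))
    {
        while (icu_tokenizer_next_token(tok, tkn16, &status) > 0)
        {
            /* tkn16->utf16 now holds one token of tkn16->utf16_len
               UTF-16 code units; convert or inspect it here */
        }
    }
    icu_tokenizer_destroy(tok);
    icu_buf_utf16_destroy(src16);
    icu_buf_utf16_destroy(tkn16);
}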