X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=util%2Fcharmap.c;h=b36282f412f4275ab0fbf567355986d78387f546;hp=31642bcea4c4a67f445250ca0915105662cc7590;hb=7c3a0352f0492609a3b6b26b63a72b0b2d207aab;hpb=cad3d9cf6b923f0bd0adb6731db88e3ff6bac80a

diff --git a/util/charmap.c b/util/charmap.c
index 31642bc..b36282f 100644
--- a/util/charmap.c
+++ b/util/charmap.c
@@ -1,5 +1,5 @@
-/* $Id: charmap.c,v 1.27 2003-01-13 10:53:16 oleg Exp $
-   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
+/* $Id: charmap.c,v 1.30 2004-09-14 14:38:08 quinn Exp $
+   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
    Index Data Aps
 
 This file is part of the Zebra server.
@@ -40,9 +40,12 @@ typedef unsigned ucs4_t;
 #define CHR_MAXSTR 1024
 #define CHR_MAXEQUIV 32
 
+const unsigned char CHR_FIELD_BEGIN = '^';
+
 const char *CHR_UNKNOWN = "\001";
 const char *CHR_SPACE   = "\002";
-const char *CHR_BASE    = "\003";
+const char *CHR_CUT     = "\003";
+const char *CHR_BASE    = "\005";
 
 struct chrmaptab_info
 {
@@ -141,7 +144,7 @@ static chr_t_entry *find_entry(chr_t_entry *t, const char **from, int len)
     return t->target ? t : 0;
 }
 
-static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len)
+static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len, int first)
 {
     chr_t_entry *res;
 
@@ -152,35 +155,49 @@ static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len)
 	from++;
 	len++;
     }
-    if (*len > 0 && t->children && t->children[(unsigned char) **from])
+    if (*len > 0 && t->children)
     {
 	const char *old_from = *from;
 	int old_len = *len;
+
+	res = 0;
+
+	if (first && t->children[CHR_FIELD_BEGIN])
+	{
+	    if ((res = find_entry_x(t->children[CHR_FIELD_BEGIN], from, len, 0)) && res != t->children[CHR_FIELD_BEGIN])
+		return res;
+            else
+	        res = 0;
+	    /* otherwise there was no match on beginning of field, move on */
+	} 
 	
-	(*len)--;
-	(*from)++;
-	if ((res = find_entry_x(t->children[(unsigned char) *old_from],
-				from, len)))
-	    return res;
-	/* no match */
-	*len = old_len;
-	*from = old_from;
+	if (!res && t->children[(unsigned char) **from])
+	{
+	    (*len)--;
+	    (*from)++;
+	    if ((res = find_entry_x(t->children[(unsigned char) *old_from],
+				    from, len, 0)))
+		return res;
+	    /* no match */
+	    *len = old_len;
+	    *from = old_from;
+	}
     }
     /* no children match. use ourselves, if we have a target */
     return t->target ? t : 0;
 }
 
-const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len)
+const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len, int first)
 {
     chr_t_entry *t = maptab->input;
     chr_t_entry *res;
 
-    if (!(res = find_entry_x(t, from, len)))
+    if (!(res = find_entry_x(t, from, len, first)))
 	abort();
     return (const char **) (res->target);
 }
 
-const char **chr_map_input(chrmaptab maptab, const char **from, int len)
+const char **chr_map_input(chrmaptab maptab, const char **from, int len, int first)
 {
     chr_t_entry *t = maptab->input;
     chr_t_entry *res;
@@ -188,7 +205,7 @@ const char **chr_map_input(chrmaptab maptab, const char **from, int len)
 
     len_tmp[0] = len;
     len_tmp[1] = -1;
-    if (!(res = find_entry_x(t, from, len_tmp)))
+    if (!(res = find_entry_x(t, from, len_tmp, first)))
 	abort();
     return (const char **) (res->target);
 }
@@ -244,13 +261,21 @@ unsigned char zebra_prim(char **s)
     return c;
 }
 
+static int zebra_ucs4_strlen(ucs4_t *s) /* length of a 0-terminated ucs4_t string */
+{
+    int i = 0;          /* elements counted so far */
+    while (*s++)        /* advance until the terminating 0 element is read */
+	i++;
+    return i;           /* the count excludes the terminator */
+}
+
 ucs4_t zebra_prim_w(ucs4_t **s)
 {
     ucs4_t c;
     ucs4_t i = 0;
     char fmtstr[8];
 
-    yaz_log (LOG_DEBUG, "prim %.3s", (char *) *s);
+    yaz_log (LOG_DEBUG, "prim_w %.3s", (char *) *s);
     if (**s == '\\')
     {
 	(*s)++;
@@ -263,13 +288,16 @@ ucs4_t zebra_prim_w(ucs4_t **s)
 	case 't': c = '\t'; (*s)++; break;
 	case 's': c = ' '; (*s)++; break;
 	case 'x': 
-            fmtstr[0] = (*s)[0];
-            fmtstr[1] = (*s)[1];
-            fmtstr[2] = (*s)[2];
-            fmtstr[3] = 0;
-            sscanf(fmtstr, "x%2x", &i);
-            c = i;
-            *s += 3; break;
+	    if (zebra_ucs4_strlen(*s) >= 3)
+	    {
+		fmtstr[0] = (*s)[1];
+		fmtstr[1] = (*s)[2];
+		fmtstr[2] = 0;
+		sscanf(fmtstr, "%x", &i);
+		c = i;
+		*s += 3;
+	    }
+	    break;
         case '0':
         case '1':
         case '2':
@@ -280,14 +308,30 @@ ucs4_t zebra_prim_w(ucs4_t **s)
         case '7':
         case '8':
         case '9':
-            fmtstr[0] = (*s)[0];
-            fmtstr[1] = (*s)[1];
-            fmtstr[2] = (*s)[2];
-            fmtstr[3] = 0;
-	    sscanf(fmtstr, "%3o", &i);
-            c = i;
-            *s += 3;
+	    if (zebra_ucs4_strlen(*s) >= 3)
+	    {
+		fmtstr[0] = (*s)[0];
+		fmtstr[1] = (*s)[1];
+		fmtstr[2] = (*s)[2];
+		fmtstr[3] = 0;
+		sscanf(fmtstr, "%o", &i);
+		c = i;
+		*s += 3;
+	    }
             break;
+	case 'L':
+	    if (zebra_ucs4_strlen(*s) >= 5)
+	    {
+		fmtstr[0] = (*s)[1];
+		fmtstr[1] = (*s)[2];
+		fmtstr[2] = (*s)[3];
+		fmtstr[3] = (*s)[4];
+		fmtstr[4] = 0;
+		sscanf(fmtstr, "%x", &i);
+		c = i;
+		*s += 5;
+	    }
+	    break;
         default:
             (*s)++;
 	}
@@ -327,6 +371,17 @@ static void fun_addspace(const char *s, void *data, int num)
 				(char*) CHR_SPACE, 0);
 }
 
+/* 
+ * Callback function: maps the string s to the CHR_CUT marker in the input
+ * map of the chrmaptab passed via data. The num argument is not used.
+ */
+static void fun_addcut(const char *s, void *data, int num)
+{
+    chrmaptab tab = (chrmaptab) data;
+    tab->input = set_map_string(tab->input, tab->nmem, s, strlen(s),
+				(char*) CHR_CUT, 0);
+}
+
 /*
  * Create a string containing the mapped characters provided.
  */
@@ -335,7 +390,7 @@ static void fun_mkstring(const char *s, void *data, int num)
     chrwork *arg = (chrwork *) data;
     const char **res, *p = s;
 
-    res = chr_map_input(arg->map, &s, strlen(s));
+    res = chr_map_input(arg->map, &s, strlen(s), 0);
     if (*res == (char*) CHR_UNKNOWN)
 	logf(LOG_WARN, "Map: '%s' has no mapping", p);
     strncat(arg->string, *res, CHR_MAXSTR - strlen(arg->string));
@@ -386,6 +441,8 @@ static int scan_to_utf8 (yaz_iconv_t t, ucs4_t *from, size_t inlen,
         ret = yaz_iconv (t, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
         if (ret == (size_t) (-1))
         {
+	    yaz_log(LOG_LOG, "from: %2X %2X %2X %2X",
+		    from[0], from[1], from[2], from[3]);
             yaz_log (LOG_WARN|LOG_ERRNO, "bad unicode sequence");
             return -1;
         }
@@ -402,6 +459,7 @@ static int scan_string(char *s_native,
     char str[1024];
 
     ucs4_t arg[512];
+    ucs4_t arg_prim[512];
     ucs4_t *s0, *s = arg;
     ucs4_t c, begin, end;
     size_t i;
@@ -457,11 +515,11 @@ static int scan_string(char *s_native,
 	case '[': s++; abort(); break;
 	case '(':
             ++s;
-            s0 = s;
-            while (*s != ')' || s[-1] == '\\')
-                s++;
-	    *s = 0;
-            if (scan_to_utf8 (t_utf8, s0, s - s0, str, sizeof(str)-1))
+	    s0 = s; i = 0;
+	    while (*s != ')' || s[-1] == '\\')
+		arg_prim[i++] = zebra_prim_w(&s);
+	    arg_prim[i] = 0;
+            if (scan_to_utf8 (t_utf8, arg_prim, zebra_ucs4_strlen(arg_prim), str, sizeof(str)-1))
                 return -1;
 	    (*fun)(str, data, num ? (*num)++ : 0);
 	    s++;
@@ -583,7 +641,7 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only,
 	{
 	    if (argc != 2)
 	    {
-		logf(LOG_FATAL, "Syntax error in charmap");
+		logf(LOG_FATAL, "Syntax error in charmap for space");
 		++errors;
 	    }
 	    if (scan_string(argv[1], t_unicode, t_utf8,
@@ -593,6 +651,20 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only,
 		++errors;
 	    }
 	}
+	else if (!map_only && !yaz_matchstr(argv[0], "cut"))
+	{
+	    if (argc != 2)
+	    {
+		logf(LOG_FATAL, "Syntax error in charmap for cut");
+		++errors;
+	    }
+	    if (scan_string(argv[1], t_unicode, t_utf8,
+                            fun_addcut, res, 0) < 0)
+	    {
+		logf(LOG_FATAL, "Bad cut specification");
+		++errors;
+	    }
+	}
 	else if (!yaz_matchstr(argv[0], "map"))
 	{
 	    chrwork buf;
@@ -648,20 +720,23 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only,
 	     * zebra need to comment next part of code.
 	     */
 
-	    /*
+	    /* Original code */
+#if 1
             if (t_unicode != 0)
                 yaz_iconv_close (t_unicode);
             t_unicode = yaz_iconv_open (ucs4_native, argv[1]);
-	    */
-	    
+#endif
 	    /*
 	     * Fix me. It is additional staff for conversion of characters from local encoding
 	     * of *.chr file to UTF-8 (internal encoding).
 	     * NOTE: The derective encoding must be first directive in *.chr file.
 	     */
+	    /* For whatever reason Oleg enabled this.. */
+#if 0
 	    if (t_utf8 != 0)
         	yaz_iconv_close(t_utf8);
 	    t_utf8 = yaz_iconv_open ("UTF-8", argv[1]);
+#endif
         }
 	else
 	{