From 570bf3746bef524124e2a90a2999981a1c734b20 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 23 Oct 2007 12:26:25 +0000 Subject: [PATCH] First parts of index_rules system (ICU support). --- include/Makefile.am | 4 +- include/index_rules.h | 93 +++++++++++++++++++++++ include/rob_regexp.h | 46 ++++++++++++ util/Makefile.am | 9 ++- util/index_rules.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++ util/rob_regexp.c | 88 ++++++++++++++++++++++ util/tst_index_rules.c | 113 ++++++++++++++++++++++++++++ 7 files changed, 544 insertions(+), 5 deletions(-) create mode 100644 include/index_rules.h create mode 100644 include/rob_regexp.h create mode 100644 util/index_rules.c create mode 100644 util/rob_regexp.c create mode 100644 util/tst_index_rules.c diff --git a/include/Makefile.am b/include/Makefile.am index c8a4834..f176108 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -1,7 +1,7 @@ -# $Id: Makefile.am,v 1.25 2006-11-21 14:32:38 adam Exp $ +# $Id: Makefile.am,v 1.26 2007-10-23 12:26:25 adam Exp $ noinst_HEADERS = bset.h charmap.h \ direntz.h passwddb.h dfa.h zebra_xpath.h d1_absyn.h \ rset.h dfaset.h sortidx.h zebra-lock.h attrfind.h zebramap.h \ - it_key.h su_codec.h + it_key.h su_codec.h index_rules.h rob_regexp.h SUBDIRS = idzebra diff --git a/include/index_rules.h b/include/index_rules.h new file mode 100644 index 0000000..5966842 --- /dev/null +++ b/include/index_rules.h @@ -0,0 +1,93 @@ +/* $Id: index_rules.h,v 1.1 2007-10-23 12:26:25 adam Exp $ + Copyright (C) 1995-2007 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ + +/** + \brief Definitions for Zebra's index rules system +*/ + +#ifndef ZEBRA_INDEX_RULES_H +#define ZEBRA_INDEX_RULES_H + +#include +#include + +YAZ_BEGIN_CDECL + +typedef struct zebra_index_rules_s *zebra_index_rules_t; + +/** \brief creates index rules handler/object from file + \param fname filename + \returns handle (NULL if unsuccessful) + + Config file format: + \verbatim + + + + + + + + + + + + \endverbatim + */ +zebra_index_rules_t zebra_index_rules_create(const char *fname); + +/** \brief destroys index rules object + \param r handle + */ +void zebra_index_rules_destroy(zebra_index_rules_t r); + + +/** \brief creates index rules handler/object from xml Doc + \param doc XML document (owned by the returned handle and freed by zebra_index_rules_destroy; also freed when creation fails) + \returns handle (NULL if unsuccessful) + + Similar to zebra_index_rules_create +*/ +zebra_index_rules_t zebra_index_rules_create_doc(xmlDocPtr doc); + + +/** \brief lookup of indexrule + \param r rules + \param id id to search for + \returns pattern ID +*/ +const char *zebra_index_rule_lookup_str(zebra_index_rules_t r, const char *id); + +YAZ_END_CDECL + +#endif +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/include/rob_regexp.h b/include/rob_regexp.h new file mode 100644 index 0000000..31281ae --- /dev/null +++ b/include/rob_regexp.h @@ -0,0 +1,46 @@ +/* $Id: rob_regexp.h,v 1.1 2007-10-23 12:26:25 adam Exp $ + Copyright (C) 1995-2007 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. 
+ +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ + +/** + \brief Declaration of Zebra's simple regular expression matcher +*/ + +#ifndef ZEBRA_ROB_REGEXP_H +#define ZEBRA_ROB_REGEXP_H + +#include + +YAZ_BEGIN_CDECL + +int zebra_rob_regexp(const char *regexp, const char *text); + +YAZ_END_CDECL + +#endif +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/util/Makefile.am b/util/Makefile.am index 617216f..620121a 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -1,8 +1,9 @@ -## $Id: Makefile.am,v 1.33 2007-08-27 17:22:22 adam Exp $ +## $Id: Makefile.am,v 1.34 2007-10-23 12:26:26 adam Exp $ noinst_LTLIBRARIES = libidzebra-util.la -check_PROGRAMS = tstcharmap tstflock tstlockscope tstpass tstres +check_PROGRAMS = tstcharmap tstflock tstlockscope tstpass tstres \ + tst_index_rules TESTS = $(check_PROGRAMS) @@ -17,7 +18,7 @@ LDADD = libidzebra-util.la $(YAZLALIB) libidzebra_util_la_SOURCES = version.c zint.c res.c charmap.c zebramap.c \ passwddb.c zebra-lock.c dirent.c xpath.c atoi_zn.c snippet.c flock.c \ - attrfind.c exit.c it_key.c su_codec.c + attrfind.c exit.c it_key.c su_codec.c index_rules.c rob_regexp.c tstpass_SOURCES = tstpass.c @@ -29,6 +30,8 @@ tstlockscope_SOURCES = tstlockscope.c tstres_SOURCES = tstres.c +tst_index_rules_SOURCES = tst_index_rules.c + clean-local: -rm -rf *.LCK -rm -rf *.log diff --git a/util/index_rules.c b/util/index_rules.c new file mode 100644 index 0000000..8f2fad3 --- /dev/null +++ b/util/index_rules.c @@ -0,0 +1,196 @@ +/* $Id: index_rules.c,v 1.1 2007-10-23 
12:26:26 adam Exp $ + Copyright (C) 1995-2007 + Index Data ApS + + This file is part of the Zebra server. + + Zebra is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. + + Zebra is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with Zebra; see the file LICENSE.zebra. If not, write to the + Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. +*/ + +#include +#include +#include +#include + +#include "index_rules.h" +#include "rob_regexp.h" +#include +#include +#include +/** \brief index rules handle: holds the id of the last lookup and, when YAZ_HAVE_XML2 is set, the rule list, the last matching rule and the XML document the rules were read from */ +struct zebra_index_rules_s { + WRBUF last_id; +#if YAZ_HAVE_XML2 + struct zebra_index_rule *rules; + struct zebra_index_rule *last_rule_match; + xmlDocPtr doc; +#endif +}; +/** \brief one indexrule element in a singly linked list; the string members point at attribute text inside the XML document (parse_index_rule makes no copies) and stay 0 for attributes that are absent */ +#if YAZ_HAVE_XML2 +struct zebra_index_rule { + const xmlNode *ptr; + const char *id; + const char *locale; + const char *position; + const char *alwaysmatches; + const char *firstinfield; + const char *sort; + struct zebra_index_rule *next; +}; +/** \brief allocates a rule for the element ptr and fills it from the element's attributes; returns 0 (after logging a warning) if an attribute with an unknown name is present */ +struct zebra_index_rule *parse_index_rule(const xmlNode *ptr) +{ + struct _xmlAttr *attr; + struct zebra_index_rule *rule; +/* every attribute starts out unset; only attributes found on the element are filled in by the loop below */ + rule = xmalloc(sizeof(*rule)); + rule->next = 0; + rule->ptr = ptr; + rule->locale = 0; + rule->id = 0; + rule->position = 0; + rule->alwaysmatches = 0; + rule->firstinfield = 0; + rule->sort = 0; + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr->children && attr->children->type == XML_TEXT_NODE) + { + if (!strcmp((const char *) attr->name, "id")) + rule->id = (const char *) attr->children->content; + else if (!strcmp((const char *) attr->name, "locale")) + rule->locale = (const char *) 
attr->children->content; + else if (!strcmp((const char *) attr->name, "position")) + rule->position = (const char *) attr->children->content; + else if (!strcmp((const char *) attr->name, "alwaysmatches")) + rule->alwaysmatches = (const char *) attr->children->content; + else if (!strcmp((const char *) attr->name, "firstinfield")) + rule->firstinfield = (const char *) attr->children->content; + else if (!strcmp((const char *) attr->name, "sort")) + rule->sort = (const char *) attr->children->content; + else + { + /* an attribute with an unknown name makes the whole rule invalid */ + yaz_log(YLOG_WARN, "Unsupport attribute '%s' for indexrule", + attr->name); + xfree(rule); + return 0; + } + } + } + return rule; +} +/* YAZ_HAVE_XML2 */ +#endif + +/* Parses the XML file fname and builds the rules from it. Returns 0 when the file cannot be parsed, when zebra_index_rules_create_doc rejects the document, or when built without XML support. */ +zebra_index_rules_t zebra_index_rules_create(const char *fname) +{ +#if YAZ_HAVE_XML2 + xmlDocPtr doc = xmlParseFile(fname); + if (!doc) + return 0; + return zebra_index_rules_create_doc(doc); +#else + yaz_log(YLOG_WARN, "Cannot read index rules %s because YAZ is without XML " + "support", fname); + return 0; +/* YAZ_HAVE_XML2 */ +#endif +} + +/* Builds the rules from a parsed document whose root element must be "indexrules"; every "indexrule" child element becomes one rule (in document order) and other children are skipped. The handle stores doc and zebra_index_rules_destroy frees it, which also happens on the failure paths below. Returns 0 on failure. */ +zebra_index_rules_t zebra_index_rules_create_doc(xmlDocPtr doc) +{ +#if YAZ_HAVE_XML2 + zebra_index_rules_t r = xmalloc(sizeof(*r)); + struct zebra_index_rule **rp = &r->rules; + const xmlNode *top = xmlDocGetRootElement(doc); + + r->doc = doc; + r->last_rule_match = 0; + r->last_id = wrbuf_alloc(); + *rp = 0; + if (top && top->type == XML_ELEMENT_NODE + && !strcmp((const char *) top->name, "indexrules")) + { + const xmlNode *ptr = top->children; + for (; ptr; ptr = ptr->next) + { + if (ptr->type == XML_ELEMENT_NODE + && !strcmp((const char *) ptr->name, "indexrule")) + { + /* rp always addresses the terminating link, so the list stays well-formed even when parsing fails */ + *rp = parse_index_rule(ptr); + if (!*rp) + { + zebra_index_rules_destroy(r); + return 0; + } + rp = &(*rp)->next; + } + } + } + else + { + zebra_index_rules_destroy(r); + r = 0; + } + return r; +#else + /* only a document is passed in here, so there is no file name to report */ + yaz_log(YLOG_WARN, "Cannot read index rules because YAZ is without XML " + "support"); + return 0; +/* YAZ_HAVE_XML2 */ +#endif +} + +/* Frees the rule list, the XML document and the handle itself. r is dereferenced unconditionally and must not be NULL. */ +void zebra_index_rules_destroy(zebra_index_rules_t r) +{ +#if YAZ_HAVE_XML2 + struct zebra_index_rule *rule; + while (r->rules) + { + rule = r->rules; + r->rules = rule->next; + xfree(rule); + } + 
xmlFreeDoc(r->doc); + +#endif + wrbuf_destroy(r->last_id); + xfree(r); +} + +/* Returns the id attribute (the pattern) of the first rule whose pattern matches id, or 0 when no rule matches or when built without XML support. The previous query and its matching rule are remembered, so repeating the same successful lookup skips the scan. */ +const char *zebra_index_rule_lookup_str(zebra_index_rules_t r, const char *id) +{ +#if YAZ_HAVE_XML2 + if (r->last_rule_match && !strcmp(wrbuf_cstr(r->last_id), id)) + return r->last_rule_match->id; + else + { + struct zebra_index_rule *rule = r->rules; + + wrbuf_rewind(r->last_id); + wrbuf_puts(r->last_id, id); + /* a rule without an id attribute has no pattern: skip it instead of handing a NULL pattern to the matcher */ + while (rule && !(rule->id && zebra_rob_regexp(rule->id, id))) + rule = rule->next; + r->last_rule_match = rule; + if (rule) + return rule->id; + } +#endif + return 0; +} + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/util/rob_regexp.c b/util/rob_regexp.c new file mode 100644 index 0000000..042eb10 --- /dev/null +++ b/util/rob_regexp.c @@ -0,0 +1,88 @@ +/* $Id: rob_regexp.c,v 1.1 2007-10-23 12:26:26 adam Exp $ + Copyright (C) 1995-2007 + Index Data ApS + + This file is part of the Zebra server. + + Zebra is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. + + Zebra is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with Zebra; see the file LICENSE.zebra. If not, write to the + Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. +*/ + +/** + \brief Rob Pike's regular expression parser + + Taken verbatim from Beautiful code.. ANSIfied a bit. 
+ */ + + +#include +#include +#include +#include + +#include "rob_regexp.h" +#include +#include +#include +/* forward declarations: matchhere and matchstar call each other */ +static int matchhere(const char *regexp, const char *text); +static int matchstar(int c, const char *regexp, const char *text); +/** \brief returns 1 if regexp matches somewhere in text, 0 otherwise; a leading '^' restricts the match to the start of text. Other syntax handled by the helpers: '.' matches any character, a character followed by '*' matches zero or more of it, and '$' as the last pattern character matches the end of text */ +int zebra_rob_regexp(const char *regexp, const char *text) +{ + if (regexp[0] == '^') + return matchhere(regexp+1, text); + do + { + if (matchhere(regexp, text)) + return 1; + } + while (*text++); + return 0; +} +/** \brief returns 1 if regexp matches at the beginning of text, 0 otherwise; an empty regexp always matches */ +static int matchhere(const char *regexp, const char *text) +{ + if (regexp[0] == '\0') + return 1; + if (regexp[1] == '*') + return matchstar(regexp[0], regexp+2, text); + if (regexp[0] == '$' && regexp[1] == '\0') + return *text == '\0'; + if (*text && (regexp[0] == '.' || regexp[0] == *text)) + return matchhere(regexp+1, text+1); + return 0; +} +/** \brief returns 1 if zero or more occurrences of c (any character when c is '.') followed by a match of regexp are found at the beginning of text; the shortest run of c is tried first */ +static int matchstar(int c, const char *regexp, const char *text) +{ + do + { + if (matchhere(regexp, text)) + return 1; + } + while (*text && (*text++ == c || c == '.')); + return 0; +} + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/util/tst_index_rules.c b/util/tst_index_rules.c new file mode 100644 index 0000000..710ff31 --- /dev/null +++ b/util/tst_index_rules.c @@ -0,0 +1,113 @@ +/* $Id: tst_index_rules.c,v 1.1 2007-10-23 12:26:26 adam Exp $ + Copyright (C) 1995-2007 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ + +#include +#include +#include +#include +#include +/* XML text that tst1 parses with xmlParseMemory to build the rules under test */ +const char *xml_str = +" " +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +" \n" +; +/** \brief returns 1 if looking up id in r gives expected_id (both NULL, or equal strings), 0 otherwise */ +int compare_lookup(zebra_index_rules_t r, const char *id, + const char *expected_id) +{ + const char *got_id = zebra_index_rule_lookup_str(r, id); + if (!got_id && !expected_id) + return 1; /* none expected */ + + if (got_id && expected_id && !strcmp(got_id, expected_id)) + return 1; + return 0; +} +/** \brief builds rules from xml_str, checks lookups that should and should not match, then destroys the rules */ +void tst1(void) +{ + xmlDocPtr doc = xmlParseMemory(xml_str, strlen(xml_str)); + + YAZ_CHECK(doc); + if (doc) + { + zebra_index_rules_t rules = zebra_index_rules_create_doc(doc); + YAZ_CHECK(rules); + + if (!rules) + return ; + + YAZ_CHECK(compare_lookup(rules, "title:s", "^.*:s$")); + YAZ_CHECK(compare_lookup(rules, "title:sx", 0)); + YAZ_CHECK(compare_lookup(rules, "title:Sx", 0)); + YAZ_CHECK(compare_lookup(rules, "any:w", "^.*:w$")); + YAZ_CHECK(compare_lookup(rules, "any:w:en", 0)); + YAZ_CHECK(compare_lookup(rules, "any:w:el", "^.*:w:el$")); +/* repeat three of the lookups many times; the results are not checked in this loop */ + { + int i, iter = 3333; + for (i = 0; i < iter; i++) + { + compare_lookup(rules, "title:s", "^.*:s$"); + compare_lookup(rules, "title:sx", 0); + compare_lookup(rules, "title:Sx", 0); + } + } + + zebra_index_rules_destroy(rules); + } +} +/** \brief test driver: runs tst1 between the YAZ_CHECK_INIT and YAZ_CHECK_TERM macros */ +int main(int argc, char **argv) +{ + YAZ_CHECK_INIT(argc, argv); + YAZ_CHECK_LOG(); + + tst1(); + + YAZ_CHECK_TERM; +} + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + -- 1.7.10.4