From a8f5308820cd689f5b92601038cb5d24118d5f2b Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Thu, 26 Apr 2007 21:33:32 +0000 Subject: [PATCH] started factoring 7-bit ASCII normalizing functions out of logic.c into normalize7bit.[hc], later to be replaced by ICU normalization; many bits still need to be identified and moved --- src/Makefile.am | 3 +- src/logic.c | 82 +++------------------------------ src/normalize7bit.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/normalize7bit.h | 29 ++++++++++++ 4 files changed, 166 insertions(+), 76 deletions(-) create mode 100644 src/normalize7bit.c create mode 100644 src/normalize7bit.h diff --git a/src/Makefile.am b/src/Makefile.am index bd15eaf..53f51bb 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,4 +1,4 @@ -# $Id: Makefile.am,v 1.16 2007-04-25 13:14:46 marc Exp $ +# $Id: Makefile.am,v 1.17 2007-04-26 21:33:32 marc Exp $ bin_PROGRAMS = pazpar2 check_PROGRAMS = test_config \ @@ -23,6 +23,7 @@ libpazpar2_a_SOURCES = config.c config.h eventl.c eventl.h \ logic.c pazpar2.h \ record.h record.c reclists.c reclists.h \ relevance.c relevance.h termlists.c termlists.h \ + normalize7bit.h normalize7bit.c \ util.c util.h zeerex.c zeerex.h database.c database.h \ settings.h settings.c sel_thread.c sel_thread.h getaddrinfo.c \ client.c client.h connection.c connection.h host.h parameters.h diff --git a/src/logic.c b/src/logic.c index a3fdd49..23d60dc 100644 --- a/src/logic.c +++ b/src/logic.c @@ -1,4 +1,4 @@ -/* $Id: logic.c,v 1.24 2007-04-26 12:12:19 marc Exp $ +/* $Id: logic.c,v 1.25 2007-04-26 21:33:32 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. 
@@ -70,6 +70,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include "database.h" #include "client.h" #include "settings.h" +#include "normalize7bit.h" #define MAX_CHUNK 15 @@ -121,75 +122,6 @@ void pull_terms(NMEM nmem, struct ccl_rpn_node *n, char **termlist, int *num) } } -char *normalize_mergekey(char *buf, int skiparticle) -{ - char *p = buf, *pout = buf; - - if (skiparticle) - { - char firstword[64]; - char articles[] = "the den der die des an a "; // must end in space - - while (*p && !isalnum(*p)) - p++; - pout = firstword; - while (*p && *p != ' ' && pout - firstword < 62) - *(pout++) = tolower(*(p++)); - *(pout++) = ' '; - *(pout++) = '\0'; - if (!strstr(articles, firstword)) - p = buf; - pout = buf; - } - - while (*p) - { - while (*p && !isalnum(*p)) - p++; - while (isalnum(*p)) - *(pout++) = tolower(*(p++)); - if (*p) - *(pout++) = ' '; - while (*p && !isalnum(*p)) - p++; - } - if (buf != pout) - do { - *(pout--) = '\0'; - } - while (pout > buf && *pout == ' '); - - return buf; -} - -// Extract what appears to be years from buf, storing highest and -// lowest values. -static int extract_years(const char *buf, int *first, int *last) -{ - *first = -1; - *last = -1; - while (*buf) - { - const char *e; - int len; - - while (*buf && !isdigit(*buf)) - buf++; - len = 0; - for (e = buf; *e && isdigit(*e); e++) - len++; - if (len == 4) - { - int value = atoi(buf); - if (*first < 0 || value < *first) - *first = value; - if (*last < 0 || value > *last) - *last = value; - } - buf = e; - } - return *first; -} static void add_facet(struct session *s, const char *type, const char *value) @@ -291,9 +223,7 @@ xmlDoc *normalize_record(struct session_database *sdb, Z_External *rec) return 0; } } -#endif - -#if 0 +#else // do it another way to detect transformation errors right now // but does not seem to work either! 
{ @@ -989,7 +919,7 @@ struct record *ingest_record(struct client *cl, Z_External *rec, mergekey_norm = (xmlChar *) nmem_strdup(se->nmem, (char*) mergekey); xmlFree(mergekey); - normalize_mergekey((char *) mergekey_norm, 0); + normalize7bit_mergekey((char *) mergekey_norm, 0); cluster = reclist_insert(se->reclist, global_parameters.server->service, @@ -1095,6 +1025,8 @@ struct record *ingest_record(struct client *cl, Z_External *rec, for (pe = p + strlen(p) - 1; pe > p && strchr(" ,/.:([", *pe); pe--) *pe = '\0'; + //char * normalize7bit_generic(char* str, char* rm_chars); + rec_md->data.text = nmem_strdup(se->nmem, p); } @@ -1142,7 +1074,7 @@ struct record *ingest_record(struct client *cl, Z_External *rec, cluster->sortkeys[sk_field_id] = nmem_malloc(se->nmem, sizeof(union data_types)); - normalize_mergekey(s, + normalize7bit_mergekey(s, (ser_sk->type == Metadata_sortkey_skiparticle)); cluster->sortkeys[sk_field_id]->text = s; } diff --git a/src/normalize7bit.c b/src/normalize7bit.c new file mode 100644 index 0000000..d79026b --- /dev/null +++ b/src/normalize7bit.c @@ -0,0 +1,128 @@ +/* $Id: normalize7bit.c,v 1.1 2007-04-26 21:33:32 marc Exp $ + Copyright (c) 2006-2007, Index Data. + +This file is part of Pazpar2. + +Pazpar2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Pazpar2; see the file LICENSE. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. 
+ */ + +/** \file normalize7bit.c + \brief char and string normalization for 7bit ascii only +*/ + +#include +#include +#include + +#if HAVE_CONFIG_H +#include "cconfig.h" +#endif + +#include "normalize7bit.h" + + +char * normalize7bit_generic(char* str, char* rm_chars) +{ + char *p, *pe; + for (p = str; *p && isspace(*p); p++) + ; + for (pe = p + strlen(p) - 1; + pe > p && strchr(rm_chars, *pe); pe--) + *pe = '\0'; + return p; +} + + + +char * normalize7bit_mergekey(char *buf, int skiparticle) +{ + char *p = buf, *pout = buf; + + if (skiparticle) + { + char firstword[64]; + char articles[] = "the den der die des an a "; // must end in space + + while (*p && !isalnum(*p)) + p++; + pout = firstword; + while (*p && *p != ' ' && pout - firstword < 62) + *(pout++) = tolower(*(p++)); + *(pout++) = ' '; + *(pout++) = '\0'; + if (!strstr(articles, firstword)) + p = buf; + pout = buf; + } + + while (*p) + { + while (*p && !isalnum(*p)) + p++; + while (isalnum(*p)) + *(pout++) = tolower(*(p++)); + if (*p) + *(pout++) = ' '; + while (*p && !isalnum(*p)) + p++; + } + if (buf != pout) + do { + *(pout--) = '\0'; + } + while (pout > buf && *pout == ' '); + + return buf; +} + +// Extract what appears to be years from buf, storing highest and +// lowest values. 
+int extract_years(const char *buf, int *first, int *last) +{ + *first = -1; + *last = -1; + while (*buf) + { + const char *e; + int len; + + while (*buf && !isdigit(*buf)) + buf++; + len = 0; + for (e = buf; *e && isdigit(*e); e++) + len++; + if (len == 4) + { + int value = atoi(buf); + if (*first < 0 || value < *first) + *first = value; + if (*last < 0 || value > *last) + *last = value; + } + buf = e; + } + return *first; +} + + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ diff --git a/src/normalize7bit.h b/src/normalize7bit.h new file mode 100644 index 0000000..8b53c5c --- /dev/null +++ b/src/normalize7bit.h @@ -0,0 +1,29 @@ +/* $Id: normalize7bit.h,v 1.1 2007-04-26 21:33:32 marc Exp $ + Copyright (c) 2006-2007, Index Data. + +This file is part of Pazpar2. + +Pazpar2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Pazpar2; see the file LICENSE. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. + */ + +#ifndef NORMALIZE7BIT_H +#define NORMALIZE7BIT_H + +char *normalize7bit_mergekey(char *buf, int skiparticle); +int extract_years(const char *buf, int *first, int *last); + + +#endif -- 1.7.10.4