From 60b5f5ba6f34ef79b037eb8af1e2554d9842bb10 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 18 Jan 2007 14:45:05 +0000 Subject: [PATCH] Fixed bug #826: Weird looking chars for set of MARC records. Problematic record in test/marc8.marc . The record switched to MARC-8 mode 'Basic Arabic', then back to 'basic Hebrew' (and never back to Latin1/ASCII). --- src/marcdisp.c | 47 ++++++++++++----- src/siconv.c | 89 +++++++++++++++++++++++---------- test/.cvsignore | 5 +- test/marc8.chr | 1 + test/marc8.marc | 1 + test/marc8.xml | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++ test/marc8.xml.marc | 1 + 7 files changed, 241 insertions(+), 42 deletions(-) create mode 100644 test/marc8.chr create mode 100644 test/marc8.marc create mode 100644 test/marc8.xml create mode 100644 test/marc8.xml.marc diff --git a/src/marcdisp.c b/src/marcdisp.c index f77482f..02f3d3e 100644 --- a/src/marcdisp.c +++ b/src/marcdisp.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2007, Index Data ApS * See the file LICENSE for details. * - * $Id: marcdisp.c,v 1.43 2007-01-06 16:08:04 adam Exp $ + * $Id: marcdisp.c,v 1.44 2007-01-18 14:45:05 adam Exp $ */ /** @@ -125,6 +125,27 @@ NMEM yaz_marc_get_nmem(yaz_marc_t mt) return mt->nmem; } +static void marc_iconv_reset(yaz_marc_t mt, WRBUF wr) +{ + if (mt->iconv_cd) + { +#if 1 + char outbuf[12]; + size_t outbytesleft = sizeof(outbuf); + char *outp = outbuf; + size_t r = yaz_iconv(mt->iconv_cd, 0, 0, &outp, &outbytesleft); + if (r != (size_t) (-1)) + wrbuf_write(wr, outbuf, outp - outbuf); +#else + int pos = wr->pos; + wrbuf_iconv_puts(wr, mt->iconv_cd, " "); + if (pos != wr->pos) + wr->pos--; + yaz_iconv(mt->iconv_cd, 0, 0, 0, 0); +#endif + } +} + static int marc_exec_leader(const char *leader_spec, char *leader, size_t size); @@ -472,8 +493,7 @@ int yaz_marc_write_line(yaz_marc_t mt, WRBUF wr) wrbuf_iconv_puts(wr, mt->iconv_cd, " "); wrbuf_iconv_puts(wr, mt->iconv_cd, s->code_data + using_code_len); - wrbuf_iconv_puts(wr, mt->iconv_cd, " "); - wr->pos--; + marc_iconv_reset(mt, wr); } wrbuf_puts (wr, mt->endline_str); break; @@ -481,8 +501,7 @@ int yaz_marc_write_line(yaz_marc_t mt, WRBUF wr) wrbuf_printf(wr, "%s", n->u.controlfield.tag); wrbuf_iconv_puts(wr, mt->iconv_cd, " "); wrbuf_iconv_puts(wr, mt->iconv_cd, n->u.controlfield.data); - wrbuf_iconv_puts(wr, mt->iconv_cd, " "); - wr->pos--; + marc_iconv_reset(mt, wr); wrbuf_puts (wr, mt->endline_str); break; case YAZ_MARC_COMMENT: @@ -591,6 +610,7 @@ static int yaz_marc_write_marcxml_ns1(yaz_marc_t mt, WRBUF wr, wrbuf_iconv_write_cdata(wr, mt->iconv_cd, s->code_data + using_code_len, strlen(s->code_data + using_code_len)); + marc_iconv_reset(mt, wr); wrbuf_iconv_puts(wr, mt->iconv_cd, ""); wrbuf_puts(wr, "\n"); } @@ -602,6 +622,8 @@ static int yaz_marc_write_marcxml_ns1(yaz_marc_t mt, WRBUF wr, strlen(n->u.controlfield.tag)); wrbuf_iconv_puts(wr, mt->iconv_cd, "\">"); wrbuf_iconv_puts(wr, mt->iconv_cd, n->u.controlfield.data); + + marc_iconv_reset(mt, wr); wrbuf_iconv_puts(wr, mt->iconv_cd, ""); wrbuf_puts(wr, "\n"); break; @@ -746,7 +768,7 @@ int yaz_marc_write_xml(yaz_marc_t mt, xmlNode **root_ptr, wrbuf_rewind(wr_cdata); wrbuf_iconv_puts(wr_cdata, mt->iconv_cd, s->code_data + using_code_len); - + marc_iconv_reset(mt, wr_cdata); ptr_subfield = xmlNewTextChild( ptr, ns_record, BAD_CAST "subfield", BAD_CAST wrbuf_cstr(wr_cdata)); @@ -761,7 +783,8 @@ int yaz_marc_write_xml(yaz_marc_t mt, xmlNode **root_ptr, case YAZ_MARC_CONTROLFIELD: wrbuf_rewind(wr_cdata); wrbuf_iconv_puts(wr_cdata, mt->iconv_cd, n->u.controlfield.data); - + marc_iconv_reset(mt, wr_cdata); + ptr = xmlNewTextChild(record_ptr, ns_record, BAD_CAST "controlfield", BAD_CAST wrbuf_cstr(wr_cdata)); @@ -833,6 +856,7 @@ int yaz_marc_write_iso2709(yaz_marc_t mt, WRBUF wr) /* write dummy IDFS + content */ wrbuf_iconv_putchar(wr_data_tmp, mt->iconv_cd, ' '); wrbuf_iconv_puts(wr_data_tmp, mt->iconv_cd, s->code_data); + marc_iconv_reset(mt, wr_data_tmp); } /* write dummy FS (makes MARC-8 to become ASCII) */ wrbuf_iconv_putchar(wr_data_tmp, mt->iconv_cd, ' '); @@ -844,6 +868,7 @@ int yaz_marc_write_iso2709(yaz_marc_t mt, WRBUF wr) wrbuf_rewind(wr_data_tmp); wrbuf_iconv_puts(wr_data_tmp, mt->iconv_cd, n->u.controlfield.data); + marc_iconv_reset(mt, wr_data_tmp); wrbuf_iconv_putchar(wr_data_tmp, mt->iconv_cd, ' ');/* field sep */ data_length += wrbuf_len(wr_data_tmp); break; @@ -895,17 +920,13 @@ int yaz_marc_write_iso2709(yaz_marc_t mt, WRBUF wr) { wrbuf_putc(wr, ISO2709_IDFS); wrbuf_iconv_puts(wr, mt->iconv_cd, s->code_data); - /* write dummy blank - makes MARC-8 to become ASCII */ - wrbuf_iconv_putchar(wr, mt->iconv_cd, ' '); - wr->pos--; + marc_iconv_reset(mt, wr); } wrbuf_putc(wr, ISO2709_FS); break; case YAZ_MARC_CONTROLFIELD: wrbuf_iconv_puts(wr, mt->iconv_cd, n->u.controlfield.data); - /* write dummy blank - makes MARC-8 to become ASCII */ - wrbuf_iconv_putchar(wr, mt->iconv_cd, ' '); - wr->pos--; + marc_iconv_reset(mt, wr); wrbuf_putc(wr, ISO2709_FS); break; case YAZ_MARC_COMMENT: diff --git a/src/siconv.c b/src/siconv.c index b040729..1cd81ed 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2007, Index Data ApS * See the file LICENSE for details. * - * $Id: siconv.c,v 1.32 2007-01-03 08:42:15 adam Exp $ + * $Id: siconv.c,v 1.33 2007-01-18 14:45:05 adam Exp $ */ /** * \file siconv.c @@ -85,6 +85,8 @@ struct yaz_iconv_struct { size_t (*write_handle)(yaz_iconv_t cd, unsigned long x, char **outbuf, size_t *outbytesleft, int last); + size_t (*flush_handle)(yaz_iconv_t cd, + char **outbuf, size_t *outbytesleft); int marc8_esc_mode; int comb_offset; @@ -880,6 +882,23 @@ static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x, return 0; } +static size_t yaz_flush_marc8(yaz_iconv_t cd, + char **outbuf, size_t *outbytesleft) +{ + if (strcmp(cd->write_marc8_page_chr, "\033(B")) + { + if (*outbytesleft < 3) + { + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t) (-1); + } + memcpy(*outbuf, "\033(B", 3); + (*outbuf) += 3; + *outbytesleft -= 3; + } + return 0; +} + static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x, char **outbuf, size_t *outbytesleft, int last) @@ -951,6 +970,7 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) cd->write_handle = 0; cd->read_handle = 0; cd->init_handle = 0; + cd->flush_handle = 0; cd->my_errno = YAZ_ICONV_UNKNOWN; /* a useful hack: if fromcode has leading @, @@ -988,9 +1008,15 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) else if (!yaz_matchstr(tocode, "UCS4LE")) cd->write_handle = yaz_write_UCS4LE; else if (!yaz_matchstr(tocode, "MARC8")) + { cd->write_handle = yaz_write_marc8; + cd->flush_handle = yaz_flush_marc8; + } else if (!yaz_matchstr(tocode, "MARC8s")) + { cd->write_handle = yaz_write_marc8; + cd->flush_handle = yaz_flush_marc8; + } #if HAVE_WCHAR_H else if (!yaz_matchstr(tocode, "WCHAR_T")) cd->write_handle = yaz_write_wchar_t; @@ -1021,7 +1047,7 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { - char *inbuf0; + char *inbuf0 = 0; size_t r = 0; #if HAVE_ICONV_H @@ -1049,17 +1075,30 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, return r; } #endif - if (inbuf == 0 || *inbuf == 0) + + if (inbuf) + inbuf0 = *inbuf; + + if (cd->init_flag) { - cd->init_flag = 1; cd->my_errno = YAZ_ICONV_UNKNOWN; - return 0; + cd->marc8_esc_mode = 'B'; + + cd->comb_offset = cd->comb_size = 0; + cd->compose_char = 0; + + cd->write_marc8_comb_no = 0; + cd->write_marc8_second_half_char = 0; + cd->write_marc8_last = 0; + cd->write_marc8_page_chr = "\033(B"; + + cd->unget_x = 0; + cd->no_read_x = 0; } - inbuf0 = *inbuf; if (cd->init_flag) { - if (cd->init_handle) + if (cd->init_handle && inbuf && *inbuf) { size_t no_read = 0; size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf, @@ -1074,32 +1113,26 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, *inbytesleft -= no_read; *inbuf += no_read; } - cd->marc8_esc_mode = 'B'; - - cd->comb_offset = cd->comb_size = 0; - cd->compose_char = 0; - - cd->write_marc8_comb_no = 0; - cd->write_marc8_second_half_char = 0; - cd->write_marc8_last = 0; - cd->write_marc8_page_chr = "\033(B"; - - cd->init_flag = 0; - cd->unget_x = 0; - cd->no_read_x = 0; } + cd->init_flag = 0; + while (1) { unsigned long x; size_t no_read; - if (*inbytesleft == 0) + if (cd->unget_x) { - r = *inbuf - inbuf0; - break; + x = cd->unget_x; + no_read = cd->no_read_x; } - if (!cd->unget_x) + else if (inbuf && *inbuf) { + if (*inbytesleft == 0) + { + r = *inbuf - inbuf0; + break; + } x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft, &no_read); if (no_read == 0) @@ -1110,8 +1143,12 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, } else { - x = cd->unget_x; - no_read = cd->no_read_x; + r = 0; + if (cd->flush_handle && outbuf && *outbuf) + r = (*cd->flush_handle)(cd, outbuf, outbytesleft); + if (r == 0) + cd->init_flag = 1; + break; } if (x) { diff --git a/test/.cvsignore b/test/.cvsignore index bae48d8..78ef8fa 100644 --- a/test/.cvsignore +++ b/test/.cvsignore @@ -24,6 +24,5 @@ tst_retrieval tst_tpath tst_timing tst_timing.log -nfatest1 -nfaxmltest1 -nfaxmltest1.log +tst_comstack +tst_comstack.log diff --git a/test/marc8.chr b/test/marc8.chr new file mode 100644 index 0000000..f51f8e4 --- /dev/null +++ b/test/marc8.chr @@ -0,0 +1 @@ +marc-8 diff --git a/test/marc8.marc b/test/marc8.marc new file mode 100644 index 0000000..6e4e6ef --- /dev/null +++ b/test/marc8.marc @@ -0,0 +1 @@ +02647nam^^2200469^^^4500001001400000005001700014006001900031007001500050008004100065035001500106037009000121040004500211100002300256245028100279246006300560246006300623260014400686300001100830500001300841500005700854500009300911500002901004500005701033510002401090530004201114533008901156655002901245650004301274700003601317700002801353793003101381852001501412852001401427856015401441856015401595852003001749852002801779856015401807856014901961830004702110901002002157UCD-00259230120061209034435.0m d cr bn |||a|bb|920330s1583 enk s 000 0 eng d a99851339eo aCL0036000039bProQuest Information and Learning. 300 N. Zeeb Rd., Ann Arbor, MI 48106 aCu-RivEScCu-RivESdCStRLINedcrbdWaOLN1 aClinton, Atkinson.00aClinton, Purser & Arnold, to their countreymen wheresoeuerh[electronic resource] :bWherein is described by their own hands their vnfeigned penitence for their offences past: their patience in welcoming their death, & their duetiful minds towardes her most excellent Maiestie2 aClinton, Purser & Arnold, to their countreymen wheresoever2 aClinton, Purser & Arnold, to their countreymen wheresoever aLondon :bImprinted by Iohn Wolfe and are to be sold [by W. Wright] at the middle shop in the Poultry, ioyning S. Mildreds Church,c[1583?] a[12] p aIn verse aThe first poem is signed: Thomas Walton alias Purser aClinton's full name and bookseller's name from, and publication date conjectured by, STC aSignatures: Ap4(B Bp2 aReproduction of the original in the Bodleian Library4 aSTC (2nd ed.)c5431 aAlso issued in print and on microform aElectronic reproduction.nMode of access: World Wide Web.nRestricted to UC campuses 7aElectronic texts.2local 0aPirateszEnglandvEarly works to 1800.1 aWalton, Thomas,dfl. 1583.4aut1 aArnold,dfl. 1583.4aut0 aEarly English books online aMERbkmain aSCBbnnet40zRestricted to UC campusesxSCP UCSDuhttp://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:1661040zRestricted to UC campusesxSCP UCSDuhttp://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:16610 aDVXLbELECT-GENhInternet aLAGEbin3Online access40zRestricted to UC campusesxSCP UCSDuhttp://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:1661040zRestricted to UC campusesuhttp://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:16610xCDL 0aEarly English books, 1475-1640 ;v1406:13. aDVXLb002592301 \ No newline at end of file diff --git a/test/marc8.xml b/test/marc8.xml new file mode 100644 index 0000000..bb73af0 --- /dev/null +++ b/test/marc8.xml @@ -0,0 +1,139 @@ + + 02647nam^a2200469^^^4500 + UCD-002592301 + 20061209034435.0 + m d + cr bn |||a|bb| + 920330s1583 enk s 000 0 eng d + + 99851339eo + + + CL0036000039 + ProQuest Information and Learning. 300 N. Zeeb Rd., Ann Arbor, MI 48106 + + + Cu-RivES + Cu-RivES + CStRLIN + dcrb + WaOLN + + + Clinton, Atkinson. + + + Clinton, Purser & Arnold, to their countreymen wheresoeuer + [electronic resource] : + Wherein is described by their own hands their vnfeigned penitence for their offences past: their patience in welcoming their death, & their duetiful minds towardes her most excellent Maiestie + + + Clinton, Purser & Arnold, to their countreymen wheresoever + + + Clinton, Purser & Arnold, to their countreymen wheresoever + + + London : + Imprinted by Iohn Wolfe and are to be sold [by W. Wright] at the middle shop in the Poultry, ioyning S. Mildreds Church, + [1583?] + + + [12] p + + + In verse + + + The first poem is signed: Thomas Walton alias Purser + + + Clinton's full name and bookseller's name from, and publication date conjectured by, STC + + + Signatures: A⁴ B² + + + Reproduction of the original in the Bodleian Library + + + STC (2nd ed.) + 5431 + + + Also issued in print and on microform + + + Electronic reproduction. + Mode of access: World Wide Web. + Restricted to UC campuses + + + Electronic texts. + local + + + Pirates + England + Early works to 1800. + + + Walton, Thomas, + fl. 1583. + aut + + + Arnold, + fl. 1583. + aut + + + Early English books online + + + MER + kmain + + + SCB + nnet + + + Restricted to UC campuses + SCP UCSD + http://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:16610 + + + Restricted to UC campuses + SCP UCSD + http://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:16610 + + + DVXL + ELECT-GEN + Internet + + + LAGE + in + Online access + + + Restricted to UC campuses + SCP UCSD + http://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:16610 + + + Restricted to UC campuses + http://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:16610 + CDL + + + Early English books, 1475-1640 ; + 1406:13. + + + DVXL + 002592301 + + diff --git a/test/marc8.xml.marc b/test/marc8.xml.marc new file mode 100644 index 0000000..7a433b0 --- /dev/null +++ b/test/marc8.xml.marc @@ -0,0 +1 @@ +02643nam^a2200469^^^4500001001400000005001700014006001900031007001500050008004100065035001500106037009000121040004500211100002300256245028100279246006300560246006300623260014400686300001100830500001300841500005700854500009300911500002501004500005701029510002401086530004201110533008901152655002901241650004301270700003601313700002801349793003101377852001501408852001401423856015401437856015401591852003001745852002801775856015401803856014901957830004702106901002002153UCD-00259230120061209034435.0m d cr bn |||a|bb|920330s1583 enk s 000 0 eng d a99851339eo aCL0036000039bProQuest Information and Learning. 300 N. Zeeb Rd., Ann Arbor, MI 48106 aCu-RivEScCu-RivESdCStRLINedcrbdWaOLN1 aClinton, Atkinson.00aClinton, Purser & Arnold, to their countreymen wheresoeuerh[electronic resource] :bWherein is described by their own hands their vnfeigned penitence for their offences past: their patience in welcoming their death, & their duetiful minds towardes her most excellent Maiestie2 aClinton, Purser & Arnold, to their countreymen wheresoever2 aClinton, Purser & Arnold, to their countreymen wheresoever aLondon :bImprinted by Iohn Wolfe and are to be sold [by W. Wright] at the middle shop in the Poultry, ioyning S. Mildreds Church,c[1583?] a[12] p aIn verse aThe first poem is signed: Thomas Walton alias Purser aClinton's full name and bookseller's name from, and publication date conjectured by, STC aSignatures: A⁴ B² aReproduction of the original in the Bodleian Library4 aSTC (2nd ed.)c5431 aAlso issued in print and on microform aElectronic reproduction.nMode of access: World Wide Web.nRestricted to UC campuses 7aElectronic texts.2local 0aPirateszEnglandvEarly works to 1800.1 aWalton, Thomas,dfl. 1583.4aut1 aArnold,dfl. 1583.4aut0 aEarly English books online aMERbkmain aSCBbnnet40zRestricted to UC campusesxSCP UCSDuhttp://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:1661040zRestricted to UC campusesxSCP UCSDuhttp://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:16610 aDVXLbELECT-GENhInternet aLAGEbin3Online access40zRestricted to UC campusesxSCP UCSDuhttp://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:1661040zRestricted to UC campusesuhttp://gateway.proquest.com/openurl?ctx_ver=Z39.88-2003&res_id=xri:eebo&rft_val_fmt=&rft_id=xri:eebo:image:16610xCDL 0aEarly English books, 1475-1640 ;v1406:13. aDVXLb002592301 \ No newline at end of file -- 1.7.10.4