From 60fa7941321b3d313bfa0d94e188835a35e93320 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Sun, 17 Dec 2006 15:34:11 +0000 Subject: [PATCH] Fixed bug #775: char conversion does not handle alternative UTF-8 sequences. --- NEWS | 3 +++ src/charconv.tcl | 17 ++++++++++++++++- test/Makefile.am | 3 ++- test/marccol2.u8.1.lst | 28 ++++++++++++++++++++++++++++ test/marccol2.u8.2.lst | 28 ++++++++++++++++++++++++++++ test/marccol2.u8.marc | 1 + 6 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 test/marccol2.u8.1.lst create mode 100644 test/marccol2.u8.2.lst create mode 100644 test/marccol2.u8.marc diff --git a/NEWS b/NEWS index 02c112e..d98474c 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,6 @@ +Fixed bug #775: char conversion does not handle Alternative UTF-8 +sequences. + Implemented function yaz_marc_read_line which parses MARC line format records. There is a wealth of formats out there. So far, this function reads line records produced by yaz_marc_write_line. diff --git a/src/charconv.tcl b/src/charconv.tcl index b509a8d..7876f93 100755 --- a/src/charconv.tcl +++ b/src/charconv.tcl @@ -2,7 +2,7 @@ # the next line restarts using tclsh \ if [ -f /usr/local/bin/tclsh8.4 ]; then exec tclsh8.4 "$0" "$@"; else exec tclsh "$0" "$@"; fi # -# $Id: charconv.tcl,v 1.17 2006-08-30 20:40:18 adam Exp $ +# $Id: charconv.tcl,v 1.18 2006-12-17 15:34:11 adam Exp $ proc usage {} { puts {charconv.tcl: [-p prefix] [-s split] [-o ofile] file ... } @@ -268,12 +268,14 @@ proc readfile {fname ofilehandle prefix omits reverse} { set marc_lines 0 set ucs_lines 0 set utf_lines 0 + set altutf_lines 0 set codename_lines 0 set lineno 0 set f [open $fname r] set tablenumber x set combining 0 set codename {} + set altutf {} while {1} { incr lineno set cnt [gets $f line] @@ -305,6 +307,7 @@ proc readfile {fname ofilehandle prefix omits reverse} { # puts "ins_trie $hex $marc ins_trie $hex $marc $combining $codename unset hex + } else { for {set i 0} {$i < [string length $marc]} {incr i 2} { lappend hex [string range $marc $i [expr $i+1]] @@ -314,10 +317,20 @@ proc readfile {fname ofilehandle prefix omits reverse} { unset hex } } + if {$reverse && [string length $marc]} { + for {set i 0} {$i < [string length $altutf]} {incr i 2} { + lappend hex [string range $altutf $i [expr $i+1]] + } + if {[info exists hex]} { + ins_trie $hex $marc $combining $codename + unset hex + } + } set marc {} set uni {} set codename {} set combining 0 + set altutf {} } elseif {[regexp {([0-9A-Fa-f]*)} $line s marc]} { incr marc_lines } elseif {[regexp {(.*)} $line s codename]} { @@ -338,6 +351,8 @@ proc readfile {fname ofilehandle prefix omits reverse} { incr ucs_lines } elseif {[regexp {([0-9A-Fa-f]*)} $line s utf]} { incr utf_lines + } elseif {[regexp {([0-9A-Fa-f]*)} $line s altutf]} { + incr altutf_lines } } close $f diff --git a/test/Makefile.am b/test/Makefile.am index 1d6395c..803e0f0 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,6 +1,6 @@ ## Copyright (C) 1994-2006, Index Data ApS ## All rights reserved. -## $Id: Makefile.am,v 1.27 2006-11-29 12:48:59 heikki Exp $ +## $Id: Makefile.am,v 1.28 2006-12-17 15:34:11 adam Exp $ check_PROGRAMS = tsticonv tstnmem tstmatchstr tstwrbuf tstodr tstccl tstlog \ tstsoap1 tstsoap2 tstodrstack tstlogthread tstxmlquery tstpquery \ @@ -20,6 +20,7 @@ EXTRA_DIST = tstodr.asn tstodrcodec.c tstodrcodec.h cqlsample \ marc6.marc marc6.xml marc6.chr marc6.xml.marc \ marc7.marc marc7.xml marc7.chr marc7.xml.marc \ marccol1.u8.marc marccol1.u8.1.lst marccol1.u8.2.lst \ + marccol2.u8.marc marccol2.u8.1.lst marccol2.u8.2.lst \ tst_record_conv.xsl YAZCOMP = ../util/yaz-asncomp diff --git a/test/marccol2.u8.1.lst b/test/marccol2.u8.1.lst new file mode 100644 index 0000000..2a915fa --- /dev/null +++ b/test/marccol2.u8.1.lst @@ -0,0 +1,28 @@ +03103cam a2200337 i 4500 +001 12683849 +005 20051218154744.0 +008 981008b2001 ilu 000 0 eng +035 $a 57779 +035 $a 90490 +035 $a 93202 +040 $a DLC $c DLC +906 $a 0 $b und $c orignew $d u $e ncip $f 19 $g y-gencatlg +010 $a 77123332 +245 00 $a Voyager Diacritic test -- New input 001 (SBIE) +260 $a ny $b ny, $c 2001. +300 $a p. $c cm. +500 $a New copy imported from file (8/12/99) +500 $a VOYAGER COLUMN 0 (NEW): Degree sign (°); Phono Copyright mark (℗); Copyright mark (©); Sharp (♯); Inverted Question mark (¿); Inverted Exclamation mark (¡) +500 $a VOYAGER COLUMN 1: Script L (ℓ); Polish L (Ł); Scandanavian O (Ø); D with Crossbar (Đ); Icelandic Thorn (Þ); AE Digraph (Æ); OE Digraph (Œ); Miagkii Znak (ʹ); Dot at Midline (·) +500 $a VOYAGER COLUMN 2: Musical Flat (♭); Patent Mark (®); Plus or Minus (±); O Hook (Ơ); U Hook (Ư); Alif (ʾ); alpha (DO NOT USE); Ayn (ʻ); Polish l (ł) +500 $a VOYAGER COLUMN 3: Scandanavian o (ø); d with crossbar (đ); Icelandic Thorn (þ); ae Digraph (æ); oe Digraph (œ); Tverdii Znak (ʺ); Turkish i (ı); British Pound (£); eth (ð) +500 $a VOYAGER COLUMN 4: Dagger (DO NOT USE); o Hook (ơ); u Hook (ư); Beta (DO NOT USE); Gamma (DO NOT USE); Superscript 0 (⁰); Superscript 1 (¹); Superscript 2 (²); Superscript 3 (³) +500 $a VOYAGER COLUMN 5: Superscript 4 (⁴); Superscript 5 (⁵); Superscript 6 (⁶); Superscript 7 (⁷); Superscript 8 (⁸); Superscript 9 (⁹); Superscript + (⁺); Superscript - (⁻); Superscript ( (⁽); +500 $a VOYAGER COLUMN 6: Superscript ) (⁾); Subscript 0 (₀); Subscript 1 (₁); Subscript 2 (₂); Subscript 3 (₃); Subscript 4 (₄); Subscript 5 (₅); Subscript 6 (₆); Subscript 7 (₇) +500 $a VOYAGER COLUMN 7: Subscript 8 (₈); Subscript 9 (₉); Subscript + (₊); Subscript - (₋); Subscript ( (₍); Subscript ) (₎); Pseudo Question Mark (ỏ); Grave (ò); Acute (ó) +500 $a VOYAGER COLUMN 8: Circumflex (ô); Tilde (õ); Macron (ō); Breve (ŏ); Superior Dot (ȯ); Umlaut (ö); Hacek (ǒ); Circle Above (o̊); Ligature left (o︠) +500 $a VOYAGER COLUMN 9: Ligature right (o︡) ; High Comma off center (o̕); Double Acute (ő); Candrabindu (o̐); Cedilla (o̧); Right Hook (ǫ); Dot Below (ọ); Double Dot Below (o̤); Circle Below (o̥) +500 $a VOYAGER COLUMN 10: Double Underscore (o̳); Underscore (o̲); Left Hook (o̦); Right Cedilla (o̜); Upadhmaniya (o̮); Double Tilde 1st half (o︢); Double Tilde 2nd half (o︣) ; High Comma centered (o̓) +500 $a VOYAGER PC Keyboard: Spacing Circumflex (^); Spacing Underscore (_); Spacing Grave (`); Open Curly Bracket ({); Close Curly Bracket (}); Spacing Tilde (~) +500 $a Standard PC Keyboard: 1234567890-= !@#$%^&*()_+ qwertyuiop[]\ QWERTYUIOP{}| asdfghjkl;' ASDFGHJKL:" zxcvbnm,./ ZXCVBNM<>? + diff --git a/test/marccol2.u8.2.lst b/test/marccol2.u8.2.lst new file mode 100644 index 0000000..27319c2 --- /dev/null +++ b/test/marccol2.u8.2.lst @@ -0,0 +1,28 @@ +03093cam a2200337 i 4500 +001 12683849 +005 20051218154744.0 +008 981008b2001 ilu 000 0 eng +035 $a 57779 +035 $a 90490 +035 $a 93202 +040 $a DLC $c DLC +906 $a 0 $b und $c orignew $d u $e ncip $f 19 $g y-gencatlg +010 $a 77123332 +245 00 $a Voyager Diacritic test -- New input 001 (SBIE) +260 $a ny $b ny, $c 2001. +300 $a p. $c cm. +500 $a New copy imported from file (8/12/99) +500 $a VOYAGER COLUMN 0 (NEW): Degree sign (°); Phono Copyright mark (℗); Copyright mark (©); Sharp (♯); Inverted Question mark (¿); Inverted Exclamation mark (¡) +500 $a VOYAGER COLUMN 1: Script L (ℓ); Polish L (Ł); Scandanavian O (Ø); D with Crossbar (Đ); Icelandic Thorn (Þ); AE Digraph (Æ); OE Digraph (Œ); Miagkii Znak (ʹ); Dot at Midline (·) +500 $a VOYAGER COLUMN 2: Musical Flat (♭); Patent Mark (®); Plus or Minus (±); O Hook (Ơ); U Hook (Ư); Alif (ʼ); alpha (DO NOT USE); Ayn (ʻ); Polish l (ł) +500 $a VOYAGER COLUMN 3: Scandanavian o (ø); d with crossbar (đ); Icelandic Thorn (þ); ae Digraph (æ); oe Digraph (œ); Tverdii Znak (ʺ); Turkish i (ı); British Pound (£); eth (ð) +500 $a VOYAGER COLUMN 4: Dagger (DO NOT USE); o Hook (ơ); u Hook (ư); Beta (DO NOT USE); Gamma (DO NOT USE); Superscript 0 (⁰); Superscript 1 (¹); Superscript 2 (²); Superscript 3 (³) +500 $a VOYAGER COLUMN 5: Superscript 4 (⁴); Superscript 5 (⁵); Superscript 6 (⁶); Superscript 7 (⁷); Superscript 8 (⁸); Superscript 9 (⁹); Superscript + (⁺); Superscript - (⁻); Superscript ( (⁽); +500 $a VOYAGER COLUMN 6: Superscript ) (⁾); Subscript 0 (₀); Subscript 1 (₁); Subscript 2 (₂); Subscript 3 (₃); Subscript 4 (₄); Subscript 5 (₅); Subscript 6 (₆); Subscript 7 (₇) +500 $a VOYAGER COLUMN 7: Subscript 8 (₈); Subscript 9 (₉); Subscript + (₊); Subscript - (₋); Subscript ( (₍); Subscript ) (₎); Pseudo Question Mark (ỏ); Grave (ò); Acute (ó) +500 $a VOYAGER COLUMN 8: Circumflex (ô); Tilde (õ); Macron (ō); Breve (ŏ); Superior Dot (ȯ); Umlaut (ö); Hacek (ǒ); Circle Above (o̊); Ligature left (o͡) +500 $a VOYAGER COLUMN 9: Ligature right (o) ; High Comma off center (o̕); Double Acute (ő); Candrabindu (o̐); Cedilla (o̧); Right Hook (ǫ); Dot Below (ọ); Double Dot Below (o̤); Circle Below (o̥) +500 $a VOYAGER COLUMN 10: Double Underscore (o̳); Underscore (o̲); Left Hook (o̦); Right Cedilla (o̜); Upadhmaniya (o̮); Double Tilde 1st half (o͠); Double Tilde 2nd half (o) ; High Comma centered (o̓) +500 $a VOYAGER PC Keyboard: Spacing Circumflex (^); Spacing Underscore (_); Spacing Grave (`); Open Curly Bracket ({); Close Curly Bracket (}); Spacing Tilde (~) +500 $a Standard PC Keyboard: 1234567890-= !@#$%^&*()_+ qwertyuiop[]\ QWERTYUIOP{}| asdfghjkl;' ASDFGHJKL:" zxcvbnm,./ ZXCVBNM<>? + diff --git a/test/marccol2.u8.marc b/test/marccol2.u8.marc new file mode 100644 index 0000000..28a97f0 --- /dev/null +++ b/test/marccol2.u8.marc @@ -0,0 +1 @@ +03103cam a2200337 i 45000010009000000050017000090080041000260350010000670350010000770350010000870400013000979060045001100100017001552450051001722600019002233000012002425000042002545000175002965000199004715000170006705000194008405000197010345000220012315000204014515000198016555000177018535000216020305000218022465000160024645000141026241268384920051218154744.0981008b2001 ilu 000 0 eng  a57779 a90490 a93202 aDLCcDLC a0bundcorignewduencipf19gy-gencatlg a 77123332 00aVoyager Diacritic test -- New input 001 (SBIE) anybny,c2001. ap.ccm. aNew copy imported from file (8/12/99) aVOYAGER COLUMN 0 (NEW): Degree sign (°); Phono Copyright mark (℗); Copyright mark (©); Sharp (♯); Inverted Question mark (¿); Inverted Exclamation mark (¡) aVOYAGER COLUMN 1: Script L (ℓ); Polish L (Ł); Scandanavian O (Ø); D with Crossbar (Đ); Icelandic Thorn (Þ); AE Digraph (Æ); OE Digraph (Œ); Miagkii Znak (ʹ); Dot at Midline (·) aVOYAGER COLUMN 2: Musical Flat (♭); Patent Mark (®); Plus or Minus (±); O Hook (Ơ); U Hook (Ư); Alif (ʾ); alpha (DO NOT USE); Ayn (ʻ); Polish l (ł) aVOYAGER COLUMN 3: Scandanavian o (ø); d with crossbar (đ); Icelandic Thorn (þ); ae Digraph (æ); oe Digraph (œ); Tverdii Znak (ʺ); Turkish i (ı); British Pound (£); eth (ð) aVOYAGER COLUMN 4: Dagger (DO NOT USE); o Hook (ơ); u Hook (ư); Beta (DO NOT USE); Gamma (DO NOT USE); Superscript 0 (⁰); Superscript 1 (¹); Superscript 2 (²); Superscript 3 (³) aVOYAGER COLUMN 5: Superscript 4 (⁴); Superscript 5 (⁵); Superscript 6 (⁶); Superscript 7 (⁷); Superscript 8 (⁸); Superscript 9 (⁹); Superscript + (⁺); Superscript - (⁻); Superscript ( (⁽); aVOYAGER COLUMN 6: Superscript ) (⁾); Subscript 0 (₀); Subscript 1 (₁); Subscript 2 (₂); Subscript 3 (₃); Subscript 4 (₄); Subscript 5 (₅); Subscript 6 (₆); Subscript 7 (₇) aVOYAGER COLUMN 7: Subscript 8 (₈); Subscript 9 (₉); Subscript + (₊); Subscript - (₋); Subscript ( (₍); Subscript ) (₎); Pseudo Question Mark (ỏ); Grave (ò); Acute (ó) aVOYAGER COLUMN 8: Circumflex (ô); Tilde (õ); Macron (ō); Breve (ŏ); Superior Dot (ȯ); Umlaut (ö); Hacek (ǒ); Circle Above (o̊); Ligature left (o︠) aVOYAGER COLUMN 9: Ligature right (o︡) ; High Comma off center (o̕); Double Acute (ő); Candrabindu (o̐); Cedilla (o̧); Right Hook (ǫ); Dot Below (ọ); Double Dot Below (o̤); Circle Below (o̥) aVOYAGER COLUMN 10: Double Underscore (o̳); Underscore (o̲); Left Hook (o̦); Right Cedilla (o̜); Upadhmaniya (o̮); Double Tilde 1st half (o︢); Double Tilde 2nd half (o︣) ; High Comma centered (o̓) aVOYAGER PC Keyboard: Spacing Circumflex (^); Spacing Underscore (_); Spacing Grave (`); Open Curly Bracket ({); Close Curly Bracket (}); Spacing Tilde (~) aStandard PC Keyboard: 1234567890-= !@#$%^&*()_+ qwertyuiop[]\ QWERTYUIOP{}| asdfghjkl;' ASDFGHJKL:" zxcvbnm,./ ZXCVBNM<>? \ No newline at end of file -- 1.7.10.4