1 // $Id: AnselToUnicode.java,v 1.5 2008/10/17 06:47:06 haschart Exp $
\r
3 * Copyright (C) 2002 Bas Peters (mail@bpeters.com)
\r
5 * This file is part of MARC4J
\r
7 * MARC4J is free software; you can redistribute it and/or
\r
8 * modify it under the terms of the GNU Lesser General Public
\r
9 * License as published by the Free Software Foundation; either
\r
10 * version 2.1 of the License, or (at your option) any later version.
\r
12 * MARC4J is distributed in the hope that it will be useful,
\r
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
15 * Lesser General Public License for more details.
\r
17 * You should have received a copy of the GNU Lesser General Public
\r
18 * License along with MARC4J; if not, write to the Free Software
\r
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
21 package org.marc4j.converter.impl;
\r
23 import java.io.InputStream;
\r
24 import java.lang.reflect.Constructor;
\r
25 import java.util.Vector;
\r
27 import org.marc4j.ErrorHandler;
\r
28 import org.marc4j.MarcException;
\r
29 import org.marc4j.converter.CharConverter;
\r
33 * A utility to convert MARC-8 data to non-precomposed UCS/Unicode.
\r
37 * The MARC-8 to Unicode mapping used is the version with the March 2005
\r
41 * @author Bas Peters
\r
42 * @author Corey Keith
\r
43 * @version $Revision: 1.5 $
\r
45 public class AnselToUnicode extends CharConverter {
\r
// Inner helper class that layers queue-style operations (put/get/peek/empty)
// on top of java.util.Vector. convert() instantiates it to hold diacritics.
// NOTE(review): most method bodies are not visible in this listing; only
// peek()'s return statement can be confirmed from here.
47 class Queue extends Vector {
\r
50 * Puts an item into the queue.
\r
53 * the item to be put into the queue.
\r
55 public Object put(Object item) {
\r
62 * Gets an item from the front of the queue.
\r
64 public Object get() {
\r
75 * Peeks at the front of the queue.
\r
77 public Object peek() {
\r
// The front of the queue is index 0 of the backing Vector.
80 return elementAt(0);
\r
84 * Returns true if the queue is empty.
\r
86 public boolean empty() {
\r
// Builds a human-readable dump of the tracker state: the current offset plus
// the G0 and G1 values rendered as hexadecimal, followed by a "Multibyte: "
// label. NOTE(review): the tail of the expression is not visible here;
// presumably the multibyte flag is appended - confirm against the full file.
100 public String toString() {
\r
101 return "Offset: " + offset + " G0: " + Integer.toHexString(g0)
\r
102 + " G1: " + Integer.toHexString(g1) + " Multibyte: "
\r
// Code table consulted by getChar(), getMBChar() and convert() (isCombining)
// to translate MARC-8 code points.
107 protected CodeTableInterface ct;
\r
// Tracks whether the table currently held in ct is considered to include the
// multibyte mappings; checkMode() tests this flag before handling a
// multibyte escape sequence.
109 protected boolean loadedMultibyte = false;
\r
// Optional error sink. Where visible in this file, a non-null errorList
// receives addError(...) calls while a null one leads to a MarcException.
111 protected ErrorHandler errorList = null;
\r
113 * Creates a new instance and loads the MARC4J supplied
\r
114 * conversion tables based on the official LC tables.
\r
// No-argument constructor: asks loadGeneratedTable for a table without
// requesting multibyte data. Note that loadGeneratedTable sets
// loadedMultibyte to true itself when the generated table class loads.
117 public AnselToUnicode()
\r
119 ct = loadGeneratedTable(false);
\r
123 * Creates a new instance and loads the MARC4J supplied
\r
124 * conversion tables based on the official LC tables.
\r
// Constructor taking a flag that is forwarded unchanged to
// loadGeneratedTable to indicate whether multibyte mappings are wanted.
127 public AnselToUnicode(boolean loadMultibyte)
\r
129 ct = loadGeneratedTable(loadMultibyte);
\r
132 * Creates a new instance and loads the MARC4J supplied
\r
133 * conversion tables based on the official LC tables.
\r
// Constructor that loads the table without requesting multibyte data and
// stores the supplied ErrorHandler, which the conversion code uses to report
// problems instead of throwing.
136 public AnselToUnicode(ErrorHandler errorList)
\r
138 ct = loadGeneratedTable(false);
\r
139 this.errorList = errorList;
\r
143 * Creates a new instance and loads the MARC4J supplied
\r
144 * conversion tables based on the official LC tables.
\r
// Constructor combining both options: the multibyte flag is forwarded to
// loadGeneratedTable and the supplied ErrorHandler is stored for reporting.
147 public AnselToUnicode(ErrorHandler errorList, boolean loadMultibyte)
\r
149 ct = loadGeneratedTable(loadMultibyte);
\r
150 this.errorList = errorList;
\r
// Obtains the code table used by this converter.
// First attempt: reflectively load org.marc4j.converter.impl.CodeTableGenerated
// and instantiate it through its no-argument constructor; on success
// loadedMultibyte is set to true and the instance is returned.
// Fallback: if anything in that attempt throws, a CodeTable is built from one
// of two bundled XML resources and loadedMultibyte is set to the caller's flag.
154 private CodeTableInterface loadGeneratedTable(boolean loadMultibyte)
\r
158 Class generated = Class.forName("org.marc4j.converter.impl.CodeTableGenerated");
\r
159 Constructor cons = generated.getConstructor();
\r
160 Object ct = cons.newInstance();
\r
161 loadedMultibyte = true;
\r
162 return((CodeTableInterface)ct);
\r
// Any failure above (class missing, constructor problems, ...) is caught
// broadly and triggers the XML-based fallback.
164 catch (Exception e)
\r
// Local variable; shadows the instance field ct inside this handler.
166 CodeTableInterface ct;
\r
// NOTE(review): the condition choosing between the two resources is not
// visible here; presumably the full table is used when loadMultibyte is true
// and the "nocjk" table otherwise - confirm against the full file.
169 ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetables.xml"));
\r
173 ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetablesnocjk.xml"));
\r
175 loadedMultibyte = loadMultibyte;
\r
182 * Constructs an instance with the specified pathname.
\r
184 * Use this constructor to create an instance with a customized code table
\r
185 * mapping. The mapping file should follow the structure of LC's XML MARC-8
\r
186 * to Unicode mapping (see:
\r
187 * http://www.loc.gov/marc/specifications/codetables.xml).
\r
// Builds the converter from a caller-supplied mapping file path. The custom
// table is treated as already containing multibyte mappings
// (loadedMultibyte = true), so checkMode will not replace it later.
190 public AnselToUnicode(String pathname) {
\r
191 ct = new CodeTable(pathname);
\r
192 loadedMultibyte = true;
\r
196 * Constructs an instance with the specified input stream.
\r
198 * Use this constructor to create an instance with a customized code table
\r
199 * mapping. The mapping file should follow the structure of LC's XML MARC-8
\r
200 * to Unicode mapping (see:
\r
201 * http://www.loc.gov/marc/specifications/codetables.xml).
\r
// Builds the converter from a caller-supplied mapping stream. As with the
// pathname constructor, loadedMultibyte is set to true so the custom table
// is kept when a multibyte escape sequence is encountered.
204 public AnselToUnicode(InputStream in) {
\r
205 ct = new CodeTable(in);
\r
206 loadedMultibyte = true;
\r
210 * Loads the entire mapping (including multibyte characters) from the Library
\r
// Replaces the current code table with a CodeTable built from the bundled
// "resources/codetables.xml" resource, resolved relative to this object's
// runtime class. This method does not touch the loadedMultibyte flag.
213 private void loadMultibyte() {
\r
214 ct = new CodeTable(getClass().getResourceAsStream(
\r
215 "resources/codetables.xml"));
\r
// Consumes any character-set escape sequences located at cdt.offset and
// updates the tracker (g0, g1, multibyte, offset) accordingly.
// extra / extra2 count characters skipped inside a sequence; they are
// reported as extraneous spaces at the end of the method.
// Problems are reported through errorList when it is non-null; otherwise a
// MarcException is thrown.
// NOTE(review): the case labels of both switch statements are not visible in
// this listing, so the mapping from code characters to branches must be
// confirmed against the full file.
218 private void checkMode(char[] data, CodeTracker cdt) {
\r
// Keep going while the character at the current offset is an escape.
// NOTE(review): the bound checks offset+extra+extra2, but the switch below
// reads offset+1+extra - looks like it could index past the end when the
// escape is the last character; confirm.
222 while (cdt.offset + extra + extra2< data.length && isEscape(data[cdt.offset])) {
\r
// Dispatch on the character following the escape (after skipped chars).
223 switch (data[cdt.offset + 1 + extra]) {
\r
// Single-byte designation into G0 (second argument 0).
226 set_cdt(cdt, 0, data, 2 + extra, false);
\r
// Single-byte designation into G1 (second argument 1).
230 set_cdt(cdt, 1, data, 2 + extra, false);
\r
// Multibyte branch: make sure the multibyte table is available first.
233 if (!loadedMultibyte) {
\r
235 loadedMultibyte = true;
\r
// Second-level dispatch on the next character of the multibyte sequence.
237 switch (data[cdt.offset + 2 + extra + extra2]) {
\r
// Multibyte designation into G1.
240 set_cdt(cdt, 1, data, 3 + extra + extra2, true);
\r
// Multibyte designation into G0.
243 set_cdt(cdt, 0, data, 3 + extra + extra2, true);
\r
// Direct form: the character itself becomes G0 and multibyte mode is on.
246 cdt.g0 = data[cdt.offset + 2 + extra + extra2];
\r
247 cdt.offset += 3 + extra + extra2;
\r
248 cdt.multibyte = true;
\r
251 // space found in escape code: look ahead and try to proceed
\r
255 // unknown code character found: discard escape sequence and return
\r
257 if (errorList != null)
\r
259 errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
\r
263 throw new MarcException("Unknown character set code found following escape character.");
\r
// Direct single-byte form: the code character itself becomes G0.
271 cdt.g0 = data[cdt.offset + 1 + extra];
\r
272 cdt.offset += 2 + extra;
\r
273 cdt.multibyte = false;
\r
// Variant that skips the sequence and leaves multibyte mode off; any
// assignment that precedes these lines is not visible here.
277 cdt.offset += 2 + extra;
\r
278 cdt.multibyte = false;
\r
281 // space found in escape code: look ahead and try to proceed
\r
// Without an error handler a stray space is fatal.
282 if (errorList == null)
\r
284 throw new MarcException("Extraneous space character found within MARC8 character set escape sequence");
\r
289 // unknown code character found: discard escape sequence and return
\r
291 if (errorList != null)
\r
293 errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
\r
297 throw new MarcException("Unknown character set code found following escape character.");
\r
// Report how many characters were skipped inside escape sequences, if any.
302 if (errorList != null && ( extra != 0 || extra2 != 0))
\r
304 errorList.addError(ErrorHandler.ERROR_TYPO, "" + (extra+extra2) + " extraneous space characters found within MARC8 character set escape sequence");
\r
// Applies a character-set designation to the tracker.
//   cdt         - tracker whose g0/g1, offset and multibyte flag are updated
//   g0_or_g1    - 0 selects cdt.g0; any other value selects cdt.g1
//   data        - the input being converted
//   addnlOffset - distance from cdt.offset to the character-set code char
//   multibyte   - value stored into cdt.multibyte on success
// Malformed input is reported through errorList when it is non-null;
// otherwise a MarcException is thrown.
// NOTE(review): data[cdt.offset + addnlOffset] is read without a visible
// bounds check - confirm callers guarantee the index is valid.
308 private void set_cdt(CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, boolean multibyte)
\r
// Special two-character code "!E"; its handling is not visible in this listing.
310 if (data[cdt.offset + addnlOffset] == '!' && data[cdt.offset + addnlOffset + 1] == 'E')
\r
// A space where the code character should be.
314 else if (data[cdt.offset + addnlOffset] == ' ')
\r
316 if (errorList != null)
\r
318 errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space character found within MARC8 character set escape sequence. Skipping over space.");
\r
322 throw new MarcException("Extraneous space character found within MARC8 character set escape sequence");
\r
// One of the intermediate characters ( , ) - $ ! appears where the code
// character is expected.
326 else if ("(,)-$!".indexOf(data[cdt.offset + addnlOffset]) != -1)
\r
328 if (errorList != null)
\r
330 errorList.addError(ErrorHandler.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding intermediate character.");
\r
334 throw new MarcException("Extraneaous intermediate character found following escape character.");
\r
// Only these code characters are accepted; anything else is unknown.
338 if ("34BE1NQS2".indexOf(data[cdt.offset + addnlOffset]) == -1)
\r
341 cdt.multibyte = false;
\r
342 if (errorList != null)
\r
344 errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
\r
348 throw new MarcException("Unknown character set code found following escape character.");
\r
351 else // All is well, proceed normally
\r
// Store the code character into the selected set, then step past it.
353 if (g0_or_g1 == 0) cdt.g0 = data[cdt.offset + addnlOffset];
\r
354 else cdt.g1 = data[cdt.offset + addnlOffset];
\r
355 cdt.offset += 1 + addnlOffset;
\r
356 cdt.multibyte = multibyte;
\r
361 * Converts MARC-8 data to UCS/Unicode.
\r
364 * @param data - the MARC-8 data in an array of char
\r
365 * @return String - the UCS/Unicode data
\r
// Entry point of the converter. Walks the input with a CodeTracker, lets
// checkMode() consume escape sequences, translates each character through
// the code table and accumulates the result in a StringBuffer that is
// returned as a String.
// NOTE(review): several statements (offset increments, appends, tracker
// initialisation) are not visible in this listing; the comments below
// describe only what the visible lines establish.
367 public String convert(char data[])
\r
369 StringBuffer sb = new StringBuffer();
\r
370 int len = data.length;
\r
372 CodeTracker cdt = new CodeTracker();
\r
376 cdt.multibyte = false;
\r
// Handle any escape sequence at the very start of the data.
380 checkMode(data, cdt);
\r
// Holds combining characters until their base character has been emitted.
382 Queue diacritics = new Queue();
\r
384 while (cdt.offset < data.length)
\r
// Branch 1: a combining character that is not the last character.
386 if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
\r
387 && hasNext(cdt.offset, len))
\r
// Collect every consecutive combining character.
390 while (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
\r
391 && hasNext(cdt.offset, len))
\r
393 char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
\r
// A zero result is not queued.
394 if (c != 0) diacritics.put(new Character(c));
\r
396 checkMode(data, cdt);
\r
// The base character is appended first ...
399 char c2 = getChar(data[cdt.offset], cdt.g0, cdt.g1);
\r
401 checkMode(data, cdt);
\r
402 if (c2 != 0) sb.append(c2);
\r
// ... then the queued diacritics are drained in insertion order
// (presumably appended to sb on a line not visible here).
404 while (!diacritics.isEmpty())
\r
406 char c1 = ((Character) diacritics.get()).charValue();
\r
// Branch 2: multibyte mode - characters are looked up three at a time.
411 else if (cdt.multibyte)
\r
413 if (data[cdt.offset]== 0x20)
\r
415 // if a 0x20 byte occurs amidst a sequence of multibyte characters
\r
416 // skip over it and output a space.
\r
417 // Hmmm. If the following line is present it seems to output two spaces
\r
418 // when a space occurs in multibytes chars, without it one seems to be output.
\r
419 // sb.append(getChar(data[cdt.offset], cdt.g0, cdt.g1));
\r
// Normal case: three characters are available; when an error handler is set,
// neither of the trailing two may be a space.
422 else if (cdt.offset + 3 <= data.length && (errorList == null || data[cdt.offset+1]!= 0x20 && data[cdt.offset+2]!= 0x20))
\r
424 char c = getMBChar(makeMultibyte(data[cdt.offset], data[cdt.offset+1], data[cdt.offset+2]));
\r
425 if (errorList == null || c != 0)
\r
// Recovery: the next triple (offset+3..offset+5) decodes to a non-zero character.
430 else if (cdt.offset + 6 <= data.length && data[cdt.offset+4]!= 0x20 && data[cdt.offset+5]!= 0x20 &&
\r
431 getMBChar(makeMultibyte(data[cdt.offset+3], data[cdt.offset+4], data[cdt.offset+5])) != 0)
\r
433 if (errorList != null)
\r
435 errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters");
\r
// Recovery: the current char is above 0x7f and the triple starting one
// position later decodes to a non-zero character.
440 else if (cdt.offset + 4 <= data.length && data[cdt.offset] > 0x7f &&
\r
441 getMBChar(makeMultibyte(data[cdt.offset+1], data[cdt.offset+2], data[cdt.offset+3])) != 0)
\r
443 if (errorList != null)
\r
445 errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters");
\r
// The stray character is converted with fixed sets G0=0x42, G1=0x45.
446 sb.append(getChar(data[cdt.offset], 0x42, 0x45));
\r
// Nothing decodable: leave multibyte mode.
452 if (errorList != null)
\r
454 errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set");
\r
456 cdt.multibyte = false;
\r
// A space sits inside the triple: build the character from the non-space
// one of the two middle chars plus the char at offset+3.
461 else if (errorList != null && cdt.offset + 4 <= data.length && ( data[cdt.offset+1] == 0x20 || data[cdt.offset+2]== 0x20))
\r
463 int multiByte = makeMultibyte( data[cdt.offset], ((data[cdt.offset+1] != 0x20)? data[cdt.offset+1] : data[cdt.offset+2]), data[cdt.offset+3]);
\r
464 char c = getMBChar(multiByte);
\r
467 if (errorList != null)
\r
469 errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character");
\r
477 if (errorList != null)
\r
479 errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set");
\r
481 cdt.multibyte = false;
\r
// Fewer than three characters remain.
486 else if (cdt.offset + 3 > data.length)
\r
488 if (errorList != null)
\r
490 errorList.addError(ErrorHandler.MINOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set");
\r
491 cdt.multibyte = false;
\r
495 // if a field ends with an incomplete encoding of a multibyte character
\r
496 // simply discard that final partial character.
\r
// Branch 3: ordinary single-byte character.
505 char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
\r
506 if (c != 0) sb.append(c);
\r
// Placeholder of the form <U+XXXX> built from the raw value, zero-padded
// to four hex digits. NOTE(review): the guarding condition is not visible;
// presumably used when the lookup returned 0 - confirm.
509 String val = "0000"+Integer.toHexString((int)(data[cdt.offset]));
\r
510 sb.append("<U+"+ (val.substring(val.length()-4, val.length()))+ ">" );
\r
// Look for a new escape sequence before the next character, if one remains.
514 if (hasNext(cdt.offset, len))
\r
516 checkMode(data, cdt);
\r
519 return sb.toString();
\r
// Packs the first three elements of data into one int:
// data[0] shifted left 16 bits, data[1] shifted left 8 bits, data[2]
// unshifted, all OR-ed together. The array must hold at least three elements.
522 private int makeMultibyte(char[] data) {
\r
523 int[] chars = new int[3];
\r
524 chars[0] = data[0] << 16;
\r
525 chars[1] = data[1] << 8;
\r
526 chars[2] = data[2];
\r
527 return chars[0] | chars[1] | chars[2];
\r
// Packs three characters into one int: c1 shifted left 16 bits and c2
// shifted left 8 bits are OR-ed with chars[2].
// NOTE(review): the assignment to chars[2] is not visible in this listing;
// presumably it stores c3 unshifted, mirroring the char[] overload - confirm.
530 public int makeMultibyte(char c1, char c2, char c3)
\r
532 int[] chars = new int[3];
\r
533 chars[0] = c1 << 16;
\r
534 chars[1] = c2 << 8;
\r
536 return chars[0] | chars[1] | chars[2];
\r
// Looks ch up in the code table using either the g0 or the g1 set.
// NOTE(review): the condition selecting between the two returns is not
// visible here; presumably it depends on the value range of ch - confirm.
539 private char getChar(int ch, int g0, int g1) {
\r
541 return ct.getChar(ch, g0);
\r
543 return ct.getChar(ch, g1);
\r
// Looks up a packed multibyte value (see makeMultibyte) in the code table,
// always using the fixed character-set code 0x31.
546 public char getMBChar(int ch) {
\r
547 return ct.getChar(ch, 0x31);
\r
// Tests whether pos lies before the last valid index (len - 1), i.e. whether
// at least one more position follows pos.
// NOTE(review): the return statements are not visible in this listing;
// presumably true when the test holds and false otherwise - confirm.
550 private static boolean hasNext(int pos, int len) {
\r
551 if (pos < (len - 1))
\r
556 private static boolean isEscape(int i) {
\r