From a2ab873d02a33623c5e110bd8ba6bddebae6765b Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Wed, 18 Jan 2012 12:53:35 +0100 Subject: [PATCH] Turbo Marc Writer --- src/org/marc4j/TurboMarcXmlWriter.java | 572 ++++++++++++++++++++++++++++++++ 1 file changed, 572 insertions(+) create mode 100644 src/org/marc4j/TurboMarcXmlWriter.java diff --git a/src/org/marc4j/TurboMarcXmlWriter.java b/src/org/marc4j/TurboMarcXmlWriter.java new file mode 100644 index 0000000..789410d --- /dev/null +++ b/src/org/marc4j/TurboMarcXmlWriter.java @@ -0,0 +1,572 @@ +//$Id: MarcXmlWriter.java,v 1.9 2008/10/17 19:11:49 haschart Exp $ +/** + * Copyright (C) 2004 Bas Peters + * + * This file is part of MARC4J + * + * MARC4J is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * MARC4J is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with MARC4J; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.marc4j; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.io.Writer; +import java.util.Iterator; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Result; +import javax.xml.transform.Source; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; + +import org.marc4j.converter.CharConverter; +import org.marc4j.marc.ControlField; +import org.marc4j.marc.DataField; +import org.marc4j.marc.Leader; +import org.marc4j.marc.Record; +import org.marc4j.marc.Subfield; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import com.ibm.icu.text.Normalizer; + +/** + * Class for writing MARC record objects in Turbo MARCXML format. This class outputs a + * SAX event stream to the given {@link java.io.OutputStream}  or + * {@link javax.xml.transform.Result} object. It can be used in a SAX + * pipeline to postprocess the result. By default this class uses a null + * transform. It is strongly recommended to use a dedicated XML serializer. + * + *

+ * This class requires a JAXP compliant XML parser and XSLT processor. The + * underlying SAX2 parser should be namespace aware. In addition this class + * requires ICU4J to perform Unicode + * normalization. A stripped down version of 2.6 originating from the XOM project is included in this + * distribution. + *

+ *

+ * The following example reads a file with MARC records and writes MARCXML + * records in UTF-8 encoding to the console: + *

+ * + *
+ *  
+ *      InputStream input = new FileInputStream("input.mrc");
+ *      MarcReader reader = new MarcStreamReader(input);
+ *              
+ *      MarcWriter writer = new TurboMarcXmlWriter(System.out, true);
+ *      while (reader.hasNext()) {
+ *          Record record = reader.next();
+ *          writer.write(record);
+ *      }
+ *      writer.close();
+ *   
+ * 
+ * + *

+ * To perform a character conversion like MARC-8 to UCS/Unicode register a + * CharConverter: + *

+ * + *
+ * writer.setConverter(new AnselToUnicode());
+ * 
+ * + *

+ * In addition you can perform Unicode normalization. This is for example not + * done by the MARC-8 to UCS/Unicode converter. With Unicode normalization text + * is transformed into the canonical composed form. For example "a&#x301;bc" + * (an "a" followed by a combining acute accent) is normalized to "&#xE1;bc". To perform normalization set Unicode + * normalization to true: + *

+ * + *
+ * writer.setUnicodeNormalization(true);
+ * 
+ * + *

+ * Please note that it's not guaranteed to work if you try to convert normalized + * Unicode back to MARC-8 encoding using + * {@link org.marc4j.converter.impl.UnicodeToAnsel}. + *

+ *

+ * This class provides very basic formatting options. For more advanced options + * create an instance of this class with a + * {@link javax.xml.transform.sax.SAXResult} containing a + * {@link org.xml.sax.ContentHandler} derived from a dedicated XML + * serializer. + *

+ * + *

+ * The following example uses + * org.apache.xml.serialize.XMLSerializer to write MARC records + * to XML using MARC-8 to UCS/Unicode conversion and Unicode normalization: + *

+ * + *
+ *  
+ *      InputStream input = new FileInputStream("input.mrc");
+ *      MarcReader reader = new MarcStreamReader(input);
+ *                
+ *      OutputFormat format = new OutputFormat("xml","UTF-8", true);
+ *      OutputStream out = new FileOutputStream("output.xml");
+ *      XMLSerializer serializer = new XMLSerializer(out, format);
+ *      Result result = new SAXResult(serializer.asContentHandler());
+ *                
+ *      TurboMarcXmlWriter writer = new TurboMarcXmlWriter(result);
+ *      writer.setConverter(new AnselToUnicode());
+ *      while (reader.hasNext()) {
+ *          Record record = reader.next();
+ *          writer.write(record);
+ *      }
+ *      writer.close();
+ *   
+ * 
+ * + *

+ * You can post-process the result using a Source object pointing + * to a stylesheet resource and a Result object to hold the + * transformation result tree. The example below converts MARC to MARCXML and + * transforms the result tree to MODS using the stylesheet provided by The + * Library of Congress: + *

+ * + *
+ *  
+ *      String stylesheetUrl = "http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3.xsl";
+ *      Source stylesheet = new StreamSource(stylesheetUrl);
+ *         
+ *      Result result = new StreamResult(System.out);
+ *            
+ *      InputStream input = new FileInputStream("input.mrc");
+ *      MarcReader reader = new MarcStreamReader(input);
+ *      TurboMarcXmlWriter writer = new TurboMarcXmlWriter(result, stylesheet);
+ *      writer.setConverter(new AnselToUnicode());
+ *      while (reader.hasNext()) {
+ *          Record record = (Record) reader.next();
+ *          writer.write(record);
+ *      }
+ *      writer.close();
+ *   
+ * 
+ * + *

+ * It is also possible to write the result into a DOM Node: + *

+ * + *
+ *  
+ *      InputStream input = new FileInputStream("input.mrc");
+ *      MarcReader reader = new MarcStreamReader(input);
+ *      DOMResult result = new DOMResult();
+ *      TurboMarcXmlWriter writer = new TurboMarcXmlWriter(result);
+ *      writer.setConverter(new AnselToUnicode());
+ *      while (reader.hasNext()) {
+ *          Record record = (Record) reader.next();
+ *          writer.write(record);
+ *      }
+ *      writer.close();
+ *         
+ *      Document doc = (Document) result.getNode();
+ *   
+ * 
+ *
+ * @author Bas Peters
+ * @version $Revision: 1.9 $
+ *
+ */
+public class TurboMarcXmlWriter implements MarcWriter {
+
+    /** Prefix of control field element names; the field tag is appended to it. */
+    protected static final String CONTROL_FIELD = "c";
+
+    /** Prefix of data field element names; the field tag is appended to it. */
+    protected static final String DATA_FIELD = "d";
+
+    /**
+     * Prefix of subfield element names; the subfield code is appended to it
+     * when the code is an ASCII letter or digit.
+     */
+    protected static final String SUBFIELD = "s";
+
+    /** Name of the root element that wraps all written records. */
+    protected static final String COLLECTION = "c";
+
+    /** Name of the element that wraps a single record. */
+    protected static final String RECORD = "r";
+
+    /** Name of the element that holds the record leader. */
+    protected static final String LEADER = "l";
+
+    // Indentation is built programmatically so that the length handed to
+    // ignorableWhitespace() always matches the actual array length.
+
+    /** Line break written before the closing root tag. */
+    private static final char[] ROOT_INDENT = indentation(0);
+
+    /** Line break plus two spaces, written around record tags. */
+    private static final char[] RECORD_INDENT = indentation(2);
+
+    /** Line break plus four spaces, written around leader and field tags. */
+    private static final char[] FIELD_INDENT = indentation(4);
+
+    /** Line break plus six spaces, written before subfield tags. */
+    private static final char[] SUBFIELD_INDENT = indentation(6);
+
+    /** Whether line breaks and indentation are emitted between elements. */
+    private boolean indent = false;
+
+    /** Receives the SAX events produced by this writer. */
+    private TransformerHandler handler = null;
+
+    /**
+     * Character stream wrapping the caller's output stream. Only the
+     * OutputStream based constructors assign it; it stays <code>null</code>
+     * when the writer was constructed with a Result.
+     */
+    private Writer writer = null;
+
+    /** Optional converter applied to control field and subfield data. */
+    private CharConverter converter = null;
+
+    /** Whether data elements are normalized to NFC before they are written. */
+    private boolean normalize = false;
+
+    /**
+     * Constructs an instance with the specified output stream.
+     *
+     * The encoding name "UTF8" is used and indentation is off.
+     *
+     * @param out
+     *            the stream the XML is written to
+     * @throws MarcException
+     *             if the handler cannot be created or the root start tag
+     *             cannot be written
+     */
+    public TurboMarcXmlWriter(OutputStream out) {
+        this(out, false);
+    }
+
+    /**
+     * Constructs an instance with the specified output stream and indentation.
+     *
+     * The encoding name "UTF8" is used.
+     *
+     * @param out
+     *            the stream the XML is written to
+     * @param indent
+     *            true to emit line breaks and indentation
+     * @throws MarcException
+     *             if the handler cannot be created or the root start tag
+     *             cannot be written
+     */
+    public TurboMarcXmlWriter(OutputStream out, boolean indent) {
+        this(out, "UTF8", indent);
+    }
+
+    /**
+     * Constructs an instance with the specified output stream and character
+     * encoding. Indentation is off.
+     *
+     * @param out
+     *            the stream the XML is written to
+     * @param encoding
+     *            the name of the character encoding used for the stream
+     * @throws MarcException
+     *             if the encoding is not supported, the handler cannot be
+     *             created or the root start tag cannot be written
+     */
+    public TurboMarcXmlWriter(OutputStream out, String encoding) {
+        this(out, encoding, false);
+    }
+
+    /**
+     * Constructs an instance with the specified output stream, character
+     * encoding and indentation. The root start tag is written immediately.
+     *
+     * @param out
+     *            the stream the XML is written to
+     * @param encoding
+     *            the name of the character encoding used for the stream
+     * @param indent
+     *            true to emit line breaks and indentation
+     * @throws NullPointerException
+     *             if <code>out</code> or <code>encoding</code> is null
+     * @throws MarcException
+     *             if the encoding is not supported, the handler cannot be
+     *             created or the root start tag cannot be written
+     */
+    public TurboMarcXmlWriter(OutputStream out, String encoding, boolean indent) {
+        if (out == null) {
+            throw new NullPointerException("null OutputStream");
+        }
+        if (encoding == null) {
+            throw new NullPointerException("null encoding");
+        }
+        try {
+            setIndent(indent);
+            writer = new BufferedWriter(new OutputStreamWriter(out, encoding));
+            setHandler(new StreamResult(writer), null);
+        } catch (UnsupportedEncodingException e) {
+            throw new MarcException(e.getMessage(), e);
+        }
+        writeStartDocument();
+    }
+
+    /**
+     * Constructs an instance with the specified result. The root start tag is
+     * written immediately.
+     *
+     * @param result
+     *            the result that receives the output
+     * @throws NullPointerException
+     *             if <code>result</code> is null
+     * @throws MarcException
+     *             if the handler cannot be created or the root start tag
+     *             cannot be written
+     */
+    public TurboMarcXmlWriter(Result result) {
+        if (result == null) {
+            throw new NullPointerException("null Result");
+        }
+        setHandler(result, null);
+        writeStartDocument();
+    }
+
+    /**
+     * Constructs an instance with the specified stylesheet location and result.
+     *
+     * @param result
+     *            the result that receives the transformed output
+     * @param stylesheetUrl
+     *            the system identifier of the stylesheet
+     * @throws NullPointerException
+     *             if <code>result</code> is null
+     * @throws MarcException
+     *             if the handler cannot be created or the root start tag
+     *             cannot be written
+     */
+    public TurboMarcXmlWriter(Result result, String stylesheetUrl) {
+        this(result, new StreamSource(stylesheetUrl));
+    }
+
+    /**
+     * Constructs an instance with the specified stylesheet source and result.
+     * The root start tag is written immediately.
+     *
+     * @param result
+     *            the result that receives the transformed output
+     * @param stylesheet
+     *            the stylesheet applied to the generated SAX events
+     * @throws NullPointerException
+     *             if <code>result</code> or <code>stylesheet</code> is null
+     * @throws MarcException
+     *             if the handler cannot be created or the root start tag
+     *             cannot be written
+     */
+    public TurboMarcXmlWriter(Result result, Source stylesheet) {
+        if (stylesheet == null) {
+            throw new NullPointerException("null Source");
+        }
+        if (result == null) {
+            throw new NullPointerException("null Result");
+        }
+        setHandler(result, stylesheet);
+        writeStartDocument();
+    }
+
+    /**
+     * Writes the root end tag and ends the document. If this instance was
+     * constructed with an output stream, the wrapping character stream is
+     * closed as well.
+     *
+     * @throws MarcException
+     *             if the end of the document cannot be written or the stream
+     *             cannot be closed
+     */
+    public void close() {
+        writeEndDocument();
+        // Result based constructors never create a Writer, so there is
+        // nothing to close in that case.
+        if (writer == null) {
+            return;
+        }
+        try {
+            writer.close();
+        } catch (IOException e) {
+            throw new MarcException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Returns the character converter.
+     *
+     * @return CharConverter the character converter, or null if none is set
+     */
+    public CharConverter getConverter() {
+        return converter;
+    }
+
+    /**
+     * Sets the character converter.
+     *
+     * @param converter
+     *            the character converter
+     */
+    public void setConverter(CharConverter converter) {
+        this.converter = converter;
+    }
+
+    /**
+     * If set to true this writer will perform Unicode normalization on data
+     * elements using normalization form C (NFC). The default is false.
+     *
+     * Normalization is applied whether or not a character converter is set.
+     *
+     * @param normalize
+     *            true if this writer performs Unicode normalization, false
+     *            otherwise
+     */
+    public void setUnicodeNormalization(boolean normalize) {
+        this.normalize = normalize;
+    }
+
+    /**
+     * Returns true if this writer will perform Unicode normalization, false
+     * otherwise.
+     *
+     * @return boolean - true if this writer performs Unicode normalization,
+     *         false otherwise.
+     */
+    public boolean getUnicodeNormalization() {
+        return normalize;
+    }
+
+    /**
+     * Creates the transformer handler that receives this writer's SAX events
+     * and binds it to the given result. Without a stylesheet a handler from
+     * the no-argument factory method is used; otherwise the handler applies
+     * the stylesheet. The output method is set to "xml".
+     *
+     * @param result
+     *            the result that receives the output
+     * @param stylesheet
+     *            the stylesheet to apply, or null for none
+     * @throws MarcException
+     *             if anything fails while the handler is set up, including a
+     *             transformer factory without SAX support
+     */
+    protected void setHandler(Result result, Source stylesheet)
+            throws MarcException {
+        try {
+            TransformerFactory factory = TransformerFactory.newInstance();
+            if (!factory.getFeature(SAXTransformerFactory.FEATURE)) {
+                throw new UnsupportedOperationException(
+                        "SAXTransformerFactory is not supported");
+            }
+
+            SAXTransformerFactory saxFactory = (SAXTransformerFactory) factory;
+            if (stylesheet == null) {
+                handler = saxFactory.newTransformerHandler();
+            } else {
+                handler = saxFactory.newTransformerHandler(stylesheet);
+            }
+            handler.getTransformer()
+                    .setOutputProperty(OutputKeys.METHOD, "xml");
+            handler.setResult(result);
+
+        } catch (Exception e) {
+            throw new MarcException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Starts the document and writes the root start tag to the result.
+     *
+     * @throws MarcException
+     *             if a SAX error occurs
+     */
+    protected void writeStartDocument() {
+        try {
+            AttributesImpl atts = new AttributesImpl();
+            handler.startDocument();
+            // The namespace is declared through an explicit xmlns attribute
+            // instead of startPrefixMapping() - need better solution
+            atts.addAttribute(Constants.TURBO_MARCXML_NS_URI, "xmlns", "xmlns",
+                    "CDATA", Constants.TURBO_MARCXML_NS_URI);
+            handler.startElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION,
+                    COLLECTION, atts);
+        } catch (SAXException e) {
+            throw new MarcException(
+                    "SAX error occured while writing start document", e);
+        }
+    }
+
+    /**
+     * Writes the root end tag to the result and ends the document.
+     *
+     * @throws MarcException
+     *             if a SAX error occurs
+     */
+    protected void writeEndDocument() {
+        try {
+            writeIndent(ROOT_INDENT);
+            handler.endElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION,
+                    COLLECTION);
+            handler.endPrefixMapping("");
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw new MarcException(
+                    "SAX error occured while writing end document", e);
+        }
+    }
+
+    /**
+     * Writes a Record object to the result.
+     *
+     * @param record -
+     *            the <code>Record</code> object
+     * @throws MarcException
+     *             if a SAX error occurs
+     */
+    public void write(Record record) {
+        try {
+            toXml(record);
+        } catch (SAXException e) {
+            throw new MarcException("SAX error occured while writing record", e);
+        }
+    }
+
+    /**
+     * Returns true if indentation is active, false otherwise.
+     *
+     * @return boolean
+     */
+    public boolean hasIndent() {
+        return indent;
+    }
+
+    /**
+     * Activates or deactivates indentation. Default value is false.
+     *
+     * @param indent
+     *            true to emit line breaks and indentation
+     */
+    public void setIndent(boolean indent) {
+        this.indent = indent;
+    }
+
+    /**
+     * Emits the SAX events for one record: the record element, its leader,
+     * every control field and every data field with its subfields.
+     *
+     * @param record
+     *            the record to serialize
+     * @throws SAXException
+     *             if the handler rejects an event
+     */
+    protected void toXml(Record record) throws SAXException {
+        writeIndent(RECORD_INDENT);
+        handler.startElement(Constants.TURBO_MARCXML_NS_URI, RECORD, RECORD,
+                new AttributesImpl());
+
+        writeIndent(FIELD_INDENT);
+        handler.startElement(Constants.TURBO_MARCXML_NS_URI, LEADER, LEADER,
+                new AttributesImpl());
+        Leader leader = record.getLeader();
+        // The leader is written as is; no conversion or normalization.
+        char[] leaderChars = leader.toString().toCharArray();
+        handler.characters(leaderChars, 0, leaderChars.length);
+        handler.endElement(Constants.TURBO_MARCXML_NS_URI, LEADER, LEADER);
+
+        Iterator controlFields = record.getControlFields().iterator();
+        while (controlFields.hasNext()) {
+            writeControlField((ControlField) controlFields.next());
+        }
+
+        Iterator dataFields = record.getDataFields().iterator();
+        while (dataFields.hasNext()) {
+            writeDataField((DataField) dataFields.next());
+        }
+
+        writeIndent(RECORD_INDENT);
+        handler.endElement(Constants.TURBO_MARCXML_NS_URI, RECORD, RECORD);
+    }
+
+    /**
+     * Returns the characters to write for a data element: the data is passed
+     * through the converter if one is set and then normalized to NFC if
+     * normalization is switched on.
+     *
+     * @param data
+     *            the control field or subfield data
+     * @return the characters to hand to the SAX handler
+     */
+    protected char[] getDataElement(String data) {
+        String dataElement = data;
+        if (converter != null) {
+            dataElement = converter.convert(data);
+        }
+        // Normalization does not depend on a converter being present.
+        if (normalize) {
+            dataElement = Normalizer.normalize(dataElement, Normalizer.NFC);
+        }
+        return dataElement.toCharArray();
+    }
+
+    /** Writes a control field as an attribute-less element named prefix + tag. */
+    private void writeControlField(ControlField field) throws SAXException {
+        String elementName = CONTROL_FIELD + field.getTag();
+
+        writeIndent(FIELD_INDENT);
+        handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName,
+                elementName, new AttributesImpl());
+        char[] data = getDataElement(field.getData());
+        handler.characters(data, 0, data.length);
+        handler.endElement(Constants.TURBO_MARCXML_NS_URI, elementName,
+                elementName);
+    }
+
+    /** Writes a data field element with its indicators and all of its subfields. */
+    private void writeDataField(DataField field) throws SAXException {
+        AttributesImpl indicators = new AttributesImpl();
+        indicators.addAttribute("", "ind1", "ind1", "CDATA",
+                String.valueOf(field.getIndicator1()));
+        indicators.addAttribute("", "ind2", "ind2", "CDATA",
+                String.valueOf(field.getIndicator2()));
+        String elementName = DATA_FIELD + field.getTag();
+
+        writeIndent(FIELD_INDENT);
+        handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName,
+                elementName, indicators);
+
+        Iterator subfields = field.getSubfields().iterator();
+        while (subfields.hasNext()) {
+            writeSubfield((Subfield) subfields.next());
+        }
+
+        writeIndent(FIELD_INDENT);
+        handler.endElement(Constants.TURBO_MARCXML_NS_URI, elementName,
+                elementName);
+    }
+
+    /**
+     * Writes one subfield. A code in [a-zA-Z0-9] becomes part of the element
+     * name; any other code is written as a "code" attribute instead.
+     */
+    private void writeSubfield(Subfield subfield) throws SAXException {
+        char code = subfield.getCode();
+        String elementName = SUBFIELD;
+        // Every subfield gets its own attribute list so that neither the
+        // parent's indicators nor a sibling's code attribute leak onto it.
+        AttributesImpl atts = new AttributesImpl();
+        if (isAsciiLetterOrDigit(code)) {
+            elementName = elementName + code;
+        } else {
+            atts.addAttribute("", "code", "code", "CDATA", String.valueOf(code));
+        }
+
+        writeIndent(SUBFIELD_INDENT);
+        handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName,
+                elementName, atts);
+        char[] data = getDataElement(subfield.getData());
+        handler.characters(data, 0, data.length);
+        handler.endElement(Constants.TURBO_MARCXML_NS_URI, elementName,
+                elementName);
+    }
+
+    /** Emits the given whitespace as ignorable whitespace if indentation is active. */
+    private void writeIndent(char[] whitespace) throws SAXException {
+        if (indent) {
+            handler.ignorableWhitespace(whitespace, 0, whitespace.length);
+        }
+    }
+
+    /** Returns true if the character is in [a-zA-Z0-9]. */
+    private static boolean isAsciiLetterOrDigit(char c) {
+        return (c >= '0' && c <= '9')
+                || (c >= 'a' && c <= 'z')
+                || (c >= 'A' && c <= 'Z');
+    }
+
+    /** Builds a line break followed by the given number of spaces. */
+    private static char[] indentation(int spaces) {
+        char[] whitespace = new char[spaces + 1];
+        whitespace[0] = '\n';
+        for (int i = 1; i < whitespace.length; i++) {
+            whitespace[i] = ' ';
+        }
+        return whitespace;
+    }
+}
\ No newline at end of file
-- 
1.7.10.4