X-Git-Url: http://git.indexdata.com/?p=marc4j.git;a=blobdiff_plain;f=src%2Forg%2Fmarc4j%2FTurboMarcXmlWriter.java;fp=src%2Forg%2Fmarc4j%2FTurboMarcXmlWriter.java;h=789410d05d78457a98fddb04e1971e56b510f5b0;hp=0000000000000000000000000000000000000000;hb=a2ab873d02a33623c5e110bd8ba6bddebae6765b;hpb=9b23011f94c4a16f88a421f69ef82c6f6bd3813b diff --git a/src/org/marc4j/TurboMarcXmlWriter.java b/src/org/marc4j/TurboMarcXmlWriter.java new file mode 100644 index 0000000..789410d --- /dev/null +++ b/src/org/marc4j/TurboMarcXmlWriter.java @@ -0,0 +1,572 @@ +//$Id: MarcXmlWriter.java,v 1.9 2008/10/17 19:11:49 haschart Exp $ +/** + * Copyright (C) 2004 Bas Peters + * + * This file is part of MARC4J + * + * MARC4J is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * MARC4J is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with MARC4J; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.marc4j; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.io.Writer; +import java.util.Iterator; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Result; +import javax.xml.transform.Source; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; + +import org.marc4j.converter.CharConverter; +import org.marc4j.marc.ControlField; +import org.marc4j.marc.DataField; +import org.marc4j.marc.Leader; +import org.marc4j.marc.Record; +import org.marc4j.marc.Subfield; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import com.ibm.icu.text.Normalizer; + +/** + * Class for writing MARC record objects in MARCXML format. This class outputs a + * SAX event stream to the given {@link java.io.OutputStream} or + * {@link javax.xml.transform.Result} object. It can be used in a SAX + * pipeline to postprocess the result. By default this class uses a nulll + * transform. It is strongly recommended to use a dedicated XML serializer. + * + *
+ * This class requires a JAXP compliant XML parser and XSLT processor. The + * underlying SAX2 parser should be namespace aware. In addition this class + * requires ICU4J to perform Unicode + * normalization. A stripped down version of 2.6 originating from the XOM project is included in this + * distribution. + *
+ *+ * The following example reads a file with MARC records and writes MARCXML + * records in UTF-8 encoding to the console: + *
+ * + *+ * + * InputStream input = new FileInputStream("input.mrc"); + * MarcReader reader = new MarcStreamReader(input); + * + * MarcWriter writer = new MarcXmlWriter(System.out, true); + * while (reader.hasNext()) { + * Record record = reader.next(); + * writer.write(record); + * } + * writer.close(); + * + *+ * + *
+ * To perform a character conversion like MARC-8 to UCS/Unicode register a
+ * CharConverter
:
+ *
+ * writer.setConverter(new AnselToUnicode()); + *+ * + *
+ * In addition you can perform Unicode normalization. This is for example not + * done by the MARC-8 to UCS/Unicode converter. With Unicode normalization text + * is transformed into the canonical composed form. For example "a´bc" + * is normalized to "ábc". To perform normalization set Unicode + * normalization to true: + *
+ * + *+ * writer.setUnicodeNormalization(true); + *+ * + *
+ * Please note that it's not guaranteed to work if you try to convert normalized + * Unicode back to MARC-8 encoding using + * {@link org.marc4j.converter.impl.UnicodeToAnsel}. + *
+ *+ * This class provides very basic formatting options. For more advanced options + * create an instance of this class with a + * {@link javax.xml.transform.sax.SAXResult} containing a + * {@link org.xml.sax.ContentHandler} derived from a dedicated XML + * serializer. + *
+ * + *
+ * The following example uses
+ * org.apache.xml.serialize.XMLSerializer
to write MARC records
+ * to XML using MARC-8 to UCS/Unicode conversion and Unicode normalization:
+ *
+ * + * InputStream input = new FileInputStream("input.mrc"); + * MarcReader reader = new MarcStreamReader(input); + * + * OutputFormat format = new OutputFormat("xml","UTF-8", true); + * OutputStream out = new FileOutputStream("output.xml"); + * XMLSerializer serializer = new XMLSerializer(out, format); + * Result result = new SAXResult(serializer.asContentHandler()); + * + * MarcXmlWriter writer = new MarcXmlWriter(result); + * writer.setConverter(new AnselToUnicode()); + * while (reader.hasNext()) { + * Record record = reader.next(); + * writer.write(record); + * } + * writer.close(); + * + *+ * + *
+ * You can post-process the result using a Source
object pointing
+ * to a stylesheet resource and a Result
object to hold the
+ * transformation result tree. The example below converts MARC to MARCXML and
+ * transforms the result tree to MODS using the stylesheet provided by The
+ * Library of Congress:
+ *
+ * + * String stylesheetUrl = "http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3.xsl"; + * Source stylesheet = new StreamSource(stylesheetUrl); + * + * Result result = new StreamResult(System.out); + * + * InputStream input = new FileInputStream("input.mrc"); + * MarcReader reader = new MarcStreamReader(input); + * MarcXmlWriter writer = new MarcXmlWriter(result, stylesheet); + * writer.setConverter(new AnselToUnicode()); + * while (reader.hasNext()) { + * Record record = (Record) reader.next(); + * writer.write(record); + * } + * writer.close(); + * + *+ * + *
+ * It is also possible to write the result into a DOM Node: + *
+ * + *+ * + * InputStream input = new FileInputStream("input.mrc") + * MarcReader reader = new MarcStreamReader(input); + * DOMResult result = new DOMResult(); + * MarcXmlWriter writer = new MarcXmlWriter(result); + * writer.setConverter(new AnselToUnicode()); + * while (reader.hasNext()) { + * Record record = (Record) reader.next(); + * writer.write(record); + * } + * writer.close(); + * + * Document doc = (Document) result.getNode(); + * + *+ * + * @author Bas Peters + * @version $Revision: 1.9 $ + * + */ +public class TurboMarcXmlWriter implements MarcWriter { + + protected static final String CONTROL_FIELD = "c"; + + protected static final String DATA_FIELD = "d"; + + protected static final String SUBFIELD = "s"; + + protected static final String COLLECTION = "c"; + + protected static final String RECORD = "r"; + + protected static final String LEADER = "l"; + + private boolean indent = false; + + private TransformerHandler handler = null; + + private Writer writer = null; + + + /** + * Character encoding. Default is UTF-8. + */ + //private String encoding = "UTF8"; + + private CharConverter converter = null; + + private boolean normalize = false; + + /** + * Constructs an instance with the specified output stream. + * + * The default character encoding for UTF-8 is used. + * + * @throws MarcException + */ + public TurboMarcXmlWriter(OutputStream out) { + this(out, false); + } + + /** + * Constructs an instance with the specified output stream and indentation. + * + * The default character encoding for UTF-8 is used. + * + * @throws MarcException + */ + public TurboMarcXmlWriter(OutputStream out, boolean indent) { + this(out, "UTF8", indent); + } + + /** + * Constructs an instance with the specified output stream and character + * encoding. 
+ * + * @throws MarcException + */ + public TurboMarcXmlWriter(OutputStream out, String encoding) { + this(out, encoding, false); + } + + /** + * Constructs an instance with the specified output stream, character + * encoding and indentation. + * + * @throws MarcException + */ + public TurboMarcXmlWriter(OutputStream out, String encoding, boolean indent) { + if (out == null) { + throw new NullPointerException("null OutputStream"); + } + if (encoding == null) { + throw new NullPointerException("null encoding"); + } + try { + setIndent(indent); + writer = new OutputStreamWriter(out, encoding); + writer = new BufferedWriter(writer); + // this.encoding = encoding; + setHandler(new StreamResult(writer), null); + } catch (UnsupportedEncodingException e) { + throw new MarcException(e.getMessage(), e); + } + writeStartDocument(); + } + + /** + * Constructs an instance with the specified result. + * + * @param result + * @throws SAXException + */ + public TurboMarcXmlWriter(Result result) { + if (result == null) + throw new NullPointerException("null Result"); + setHandler(result, null); + writeStartDocument(); + } + + /** + * Constructs an instance with the specified stylesheet location and result. + * + * @param result + * @throws SAXException + */ + public TurboMarcXmlWriter(Result result, String stylesheetUrl) { + this(result, new StreamSource(stylesheetUrl)); + } + + /** + * Constructs an instance with the specified stylesheet source and result. + * + * @param result + * @throws SAXException + */ + public TurboMarcXmlWriter(Result result, Source stylesheet) { + if (stylesheet == null) + throw new NullPointerException("null Source"); + if (result == null) + throw new NullPointerException("null Result"); + setHandler(result, stylesheet); + writeStartDocument(); + } + + public void close() { + writeEndDocument(); + try { + writer.close(); + } catch (IOException e) { + throw new MarcException(e.getMessage(), e); + } + } + + /** + * Returns the character converter. 
+ * + * @return CharConverter the character converter + */ + public CharConverter getConverter() { + return converter; + } + + /** + * Sets the character converter. + * + * @param converter + * the character converter + */ + public void setConverter(CharConverter converter) { + this.converter = converter; + } + + /** + * If set to true this writer will perform Unicode normalization on data + * elements using normalization form C (NFC). The default is false. + * + * The implementation used is ICU4J 2.6. This version is based on Unicode + * 4.0. + * + * @param normalize + * true if this writer performs Unicode normalization, false + * otherwise + */ + public void setUnicodeNormalization(boolean normalize) { + this.normalize = normalize; + } + + /** + * Returns true if this writer will perform Unicode normalization, false + * otherwise. + * + * @return boolean - true if this writer performs Unicode normalization, + * false otherwise. + */ + public boolean getUnicodeNormalization() { + return normalize; + } + + protected void setHandler(Result result, Source stylesheet) + throws MarcException { + try { + TransformerFactory factory = TransformerFactory.newInstance(); + if (!factory.getFeature(SAXTransformerFactory.FEATURE)) + throw new UnsupportedOperationException( + "SAXTransformerFactory is not supported"); + + SAXTransformerFactory saxFactory = (SAXTransformerFactory) factory; + if (stylesheet == null) + handler = saxFactory.newTransformerHandler(); + else + handler = saxFactory.newTransformerHandler(stylesheet); + handler.getTransformer() + .setOutputProperty(OutputKeys.METHOD, "xml"); + handler.setResult(result); + + } catch (Exception e) { + throw new MarcException(e.getMessage(), e); + } + } + + /** + * Writes the root start tag to the result. 
+ * + * @throws SAXException + */ + protected void writeStartDocument() { + try { + AttributesImpl atts = new AttributesImpl(); + handler.startDocument(); + // The next line duplicates the namespace declaration for Marc XML + // handler.startPrefixMapping("", Constants.MARCXML_NS_URI); + // add namespace declaration using attribute - need better solution + atts.addAttribute(Constants.TURBO_MARCXML_NS_URI, "xmlns", "xmlns", + "CDATA", Constants.TURBO_MARCXML_NS_URI); + handler.startElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION, COLLECTION, atts); + } catch (SAXException e) { + throw new MarcException( + "SAX error occured while writing start document", e); + } + } + + /** + * Writes the root end tag to the result. + * + * @throws SAXException + */ + protected void writeEndDocument() { + try { + if (indent) + handler.ignorableWhitespace("\n".toCharArray(), 0, 1); + + handler + .endElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION, + COLLECTION); + handler.endPrefixMapping(""); + handler.endDocument(); + } catch (SAXException e) { + throw new MarcException( + "SAX error occured while writing end document", e); + } + } + + /** + * Writes a Record object to the result. + * + * @param record - + * the
Record
object
+ * @throws MarcException if a SAX error occurs while writing the record
+ */
+ public void write(Record record) {
+     // Reject null before any SAX event is emitted: toXml() opens the record
+     // and leader elements before it first dereferences the record, so a
+     // null argument used to leave unclosed elements in the output before
+     // failing with the same exception type.
+     if (record == null) {
+         throw new NullPointerException("null Record");
+     }
+     try {
+         toXml(record);
+     } catch (SAXException e) {
+         // Surface SAX failures as the unchecked exception used by this writer.
+         throw new MarcException("SAX error occured while writing record", e);
+     }
+ }
+
+ /**
+ * Tells whether this writer emits indentation.
+ *
+ * @return true if indentation is active, false otherwise
+ */
+ public boolean hasIndent() {
+     return this.indent;
+ }
+
+ /**
+ * Switches indentation of the output on or off. Indentation is off by
+ * default.
+ *
+ * @param indent
+ *            true to activate indentation, false to deactivate it
+ */
+ public void setIndent(boolean indent) {
+     this.indent = indent;
+ }
+
+ protected void toXml(Record record) throws SAXException {
+ char temp[];
+ AttributesImpl atts = new AttributesImpl();
+ if (indent)
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 3);
+
+ handler.startElement(Constants.TURBO_MARCXML_NS_URI, RECORD, RECORD, atts);
+
+ if (indent)
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
+
+ handler.startElement(Constants.TURBO_MARCXML_NS_URI, LEADER, LEADER, atts);
+ Leader leader = record.getLeader();
+ temp = leader.toString().toCharArray();
+ handler.characters(temp, 0, temp.length);
+ handler.endElement(Constants.TURBO_MARCXML_NS_URI, LEADER, LEADER);
+
+ Iterator