1 // $Id: MarcStreamReader.java,v 1.11 2008/09/26 21:17:42 haschart Exp $
\r
3 * Copyright (C) 2004 Bas Peters
\r
5 * This file is part of MARC4J
\r
7 * MARC4J is free software; you can redistribute it and/or
\r
8 * modify it under the terms of the GNU Lesser General Public
\r
9 * License as published by the Free Software Foundation; either
\r
10 * version 2.1 of the License, or (at your option) any later version.
\r
12 * MARC4J is distributed in the hope that it will be useful,
\r
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
15 * Lesser General Public License for more details.
\r
17 * You should have received a copy of the GNU Lesser General Public
\r
18 * License along with MARC4J; if not, write to the Free Software
\r
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
23 import java.io.BufferedInputStream;
\r
24 import java.io.ByteArrayInputStream;
\r
25 import java.io.DataInputStream;
\r
26 import java.io.EOFException;
\r
27 import java.io.IOException;
\r
28 import java.io.InputStream;
\r
29 import java.io.InputStreamReader;
\r
30 import java.io.UnsupportedEncodingException;
\r
32 import org.marc4j.converter.CharConverter;
\r
33 import org.marc4j.converter.impl.AnselToUnicode;
\r
34 import org.marc4j.marc.ControlField;
\r
35 import org.marc4j.marc.DataField;
\r
36 import org.marc4j.marc.Leader;
\r
37 import org.marc4j.marc.MarcFactory;
\r
38 import org.marc4j.marc.Record;
\r
39 import org.marc4j.marc.Subfield;
\r
40 import org.marc4j.marc.impl.Verifier;
\r
43 * An iterator over a collection of MARC records in ISO 2709 format.
\r
48 * InputStream input = new FileInputStream("file.mrc");
\r
49 * MarcReader reader = new MarcStreamReader(input);
\r
50 * while (reader.hasNext()) {
\r
51 * Record record = reader.next();
\r
57 * Check the {@link org.marc4j.marc} package for examples about the use of
\r
58 * the {@link org.marc4j.marc.Record} object model.
\r
62 * When no encoding is given as a constructor argument the parser tries to
\r
63 * resolve the encoding by looking at the character coding scheme (leader
\r
64 * position 9) in MARC21 records. For UNIMARC records this position is not
\r
68 * @author Bas Peters
\r
69 * @version $Revision: 1.11 $
\r
72 public class MarcStreamReader implements MarcReader {
\r
// Underlying stream the records are read from; assigned by the two-argument constructor.
74 private DataInputStream input = null;
\r
// Record object created by the most recent call to next().
76 private Record record;
\r
// Factory used to create the record, leader, control field, data field and subfield objects.
78 private MarcFactory factory;
\r
// Name of the character encoding applied to field data in getDataAsString; defaults to ISO 8859-1.
80 private String encoding = "ISO8859_1";
\r
// NOTE(review): presumably marks that the caller supplied an explicit encoding so the
// leader value should not replace it; no use of this flag is visible in this excerpt - confirm.
82 private boolean override = false;
\r
// Converter for MARC-8 data; created on first use in getDataAsString.
84 private CharConverter converterAnsel = null;
\r
87 * Constructs an instance with the specified input stream.
\r
// Single-argument constructor taking only the input stream.
// NOTE(review): its body is not visible in this excerpt; presumably it delegates to the
// two-argument constructor without an encoding - confirm.
89 public MarcStreamReader(InputStream input) {
\r
94 * Constructs an instance with the specified input stream and character encoding.
\r
// Wraps the given stream in a BufferedInputStream and a DataInputStream and obtains a
// MarcFactory instance. A non-null encoding argument replaces the default value of the
// encoding field; a null argument leaves the default in place.
96 public MarcStreamReader(InputStream input, String encoding) {
\r
97 this.input = new DataInputStream(new BufferedInputStream(input));
\r
98 factory = MarcFactory.newInstance();
\r
// Only override the default encoding when the caller actually supplied one.
99 if (encoding != null) {
\r
100 this.encoding = encoding;
\r
106 * Returns true if the iteration has more records, false otherwise.
\r
// Decides whether more records remain by checking input.available(); the return
// statements themselves are not visible in this excerpt.
// An IOException from the stream is rethrown as a MarcException that carries the same
// message and the original exception as its cause.
// NOTE(review): InputStream.available() only estimates the bytes readable without
// blocking, so a value of 0 does not necessarily mean end of stream - confirm this is
// acceptable for the kinds of streams passed to this reader.
108 public boolean hasNext() {
\r
110 if (input.available() == 0)
\r
112 } catch (IOException e) {
\r
113 throw new MarcException(e.getMessage(), e);
\r
119 * Returns the next record in the iteration.
\r
121 * @return Record - the record object
\r
// Reads one record from the stream and parses it into a freshly created Record.
// The 24-byte leader is read first; the record length parsed from it determines how many
// further bytes (recordLength - 24) are read before both buffers go to parseRecord.
// An EOFException and any other IOException are each rethrown as a MarcException with
// the original exception as cause.
123 public Record next()
\r
125 record = factory.newRecord();
\r
// The leader is a fixed 24 bytes; readFully throws EOFException if fewer are available.
129 byte[] byteArray = new byte[24];
\r
130 input.readFully(byteArray);
\r
132 int recordLength = parseRecordLength(byteArray);
\r
// NOTE(review): a parsed record length below 24 makes this array size negative, which
// would raise NegativeArraySizeException rather than MarcException - confirm whether a
// guard exists on lines not visible in this excerpt.
133 byte[] recordBuf = new byte[recordLength - 24];
\r
134 input.readFully(recordBuf);
\r
135 parseRecord(record, byteArray, recordBuf, recordLength);
\r
138 catch (EOFException e) {
\r
139 throw new MarcException("Premature end of file encountered", e);
\r
141 catch (IOException e) {
\r
142 throw new MarcException("an error occured reading input", e);
\r
// Parses one record from bytes that have already been read into memory.
// byteArray is the 24-byte leader buffer and recordBuf holds the rest of the record, as
// passed in by next(). Visible steps: build the leader, derive the directory length from
// the base address of data, read the 12-byte directory entries, read every field in
// directory order, then check for the record terminator.
// Every failure visible here is reported as a MarcException.
146 private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength)
\r
149 ldr = factory.newLeader();
\r
150 ldr.setRecordLength(recordLength);
\r
151 int directoryLength=0;
\r
154 parseLeader(ldr, byteArray);
\r
// The directory lies between the 24-byte leader and the base address of data; the extra
// 1 presumably accounts for the field terminator that closes the directory (checked below).
155 directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
\r
157 catch (IOException e) {
\r
// NOTE(review): new String(byteArray) decodes with the platform default charset, so the
// leader text in this message can differ between machines.
158 throw new MarcException("error parsing leader with data: "
\r
159 + new String(byteArray), e);
\r
161 catch (MarcException e) {
\r
162 throw new MarcException("error parsing leader with data: "
\r
163 + new String(byteArray), e);
\r
166 // if MARC 21 then check encoding
\r
// Only one of the switch branches (the ISO-8859-1 assignment) is visible in this excerpt.
167 switch (ldr.getCharCodingScheme()) {
\r
170 encoding = "ISO-8859-1";
\r
176 record.setLeader(ldr);
\r
// Each directory entry is 12 bytes: a 3-byte tag, a 4-byte field length and a 5-byte
// starting position, so the directory length must be a multiple of 12.
178 if ((directoryLength % 12) != 0)
\r
180 throw new MarcException("invalid directory");
\r
182 DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
\r
183 int size = directoryLength / 12;
\r
185 String[] tags = new String[size];
\r
186 int[] lengths = new int[size];
\r
188 byte[] tag = new byte[3];
\r
189 byte[] length = new byte[4];
\r
190 byte[] start = new byte[5];
\r
// First pass: read every directory entry.
195 for (int i = 0; i < size; i++)
\r
197 inputrec.readFully(tag);
\r
198 tmp = new String(tag);
\r
201 inputrec.readFully(length);
\r
202 tmp = new String(length);
\r
// NOTE(review): no handler for a NumberFormatException from this parse is visible in
// this excerpt - confirm how a non-numeric field length is reported.
203 lengths[i] = Integer.parseInt(tmp);
\r
// The starting position is read to advance the stream; no use of its value is visible here.
205 inputrec.readFully(start);
\r
208 if (inputrec.read() != Constants.FT)
\r
210 throw new MarcException("expected field terminator at end of directory");
\r
// Second pass: read the field data sequentially, in directory order.
213 for (int i = 0; i < size; i++)
\r
215 //int fieldLength = getFieldLength(inputrec);
\r
216 if (Verifier.isControlField(tags[i]))
\r
// The directory length counts the trailing field terminator, which is read separately below.
// NOTE(review): a directory length of 0 would make this array size negative - confirm.
218 byteArray = new byte[lengths[i] - 1];
\r
219 inputrec.readFully(byteArray);
\r
221 if (inputrec.read() != Constants.FT)
\r
223 throw new MarcException("expected field terminator at end of field");
\r
226 ControlField field = factory.newControlField();
\r
227 field.setTag(tags[i]);
\r
228 field.setData(getDataAsString(byteArray));
\r
229 record.addVariableField(field);
\r
// For data fields the full directory length is read and the bytes are handed to
// parseDataField, which looks for the field terminator itself.
233 byteArray = new byte[lengths[i]];
\r
234 inputrec.readFully(byteArray);
\r
237 record.addVariableField(parseDataField(tags[i], byteArray));
\r
238 } catch (IOException e) {
\r
239 throw new MarcException(
\r
240 "error parsing data field for tag: " + tags[i]
\r
242 + new String(byteArray), e);
\r
// After the last field exactly one more byte is expected: the record terminator.
247 if (inputrec.read() != Constants.RT)
\r
249 throw new MarcException("expected record terminator");
\r
252 catch (IOException e)
\r
254 throw new MarcException("an error occured reading input", e);
\r
// Builds a DataField for the given tag from the raw bytes of one field.
// The first two bytes are taken as indicator 1 and indicator 2. The remaining bytes are
// scanned in a loop that is only partly visible in this excerpt; for each subfield it
// reads the code byte, sizes the data with getSubfieldLength, decodes the data with
// getDataAsString and adds the resulting Subfield to the field.
// Throws IOException when the field data ends unexpectedly.
258 private DataField parseDataField(String tag, byte[] field)
\r
259 throws IOException {
\r
260 ByteArrayInputStream bais = new ByteArrayInputStream(field);
\r
// NOTE(review): ByteArrayInputStream.read() returns -1 at end of data, so a field shorter
// than two bytes would produce the char value 0xFFFF as an indicator - confirm whether
// that case needs a guard.
261 char ind1 = (char) bais.read();
\r
262 char ind2 = (char) bais.read();
\r
264 DataField dataField = factory.newDataField();
\r
265 dataField.setTag(tag);
\r
266 dataField.setIndicator1(ind1);
\r
267 dataField.setIndicator2(ind2);
\r
275 readByte = bais.read();
\r
// The case labels of this switch are not visible in this excerpt.
278 switch (readByte) {
\r
280 code = bais.read();
\r
282 throw new IOException("unexpected end of data field");
\r
283 if (code == Constants.FT)
\r
285 size = getSubfieldLength(bais);
\r
// The statement that fills this buffer from the stream is not visible in this excerpt.
286 data = new byte[size];
\r
288 subfield = factory.newSubfield();
\r
289 subfield.setCode((char) code);
\r
290 subfield.setData(getDataAsString(data));
\r
291 dataField.addSubfield(subfield);
\r
// Helper marked as unused; the only call visible in this excerpt is the commented-out one
// in parseRecord. It inspects bytes read from the stream in a switch and throws an
// IOException for a field that is not terminated. Most of the body, including how the
// length is counted and returned, is not visible in this excerpt.
300 @SuppressWarnings("unused")
\r
301 private int getFieldLength(DataInputStream bais) throws IOException
\r
306 switch (bais.read()) {
\r
312 throw new IOException("Field not terminated");
\r
// Returns the length of the subfield data at the current stream position; called from
// parseDataField to size the data buffer. It inspects bytes read from the stream in a
// switch and throws an IOException for a subfield that is not terminated.
// NOTE(review): the case labels and any mark/reset handling are not visible in this
// excerpt, so whether the stream position is restored afterwards cannot be told from here.
320 private int getSubfieldLength(ByteArrayInputStream bais) throws IOException {
\r
324 switch (bais.read()) {
\r
331 throw new IOException("subfield not terminated");
\r
// Parses the record length from the leader bytes: five characters are converted to an
// int with Integer.parseInt. A NumberFormatException is rethrown as a MarcException with
// the original exception as cause.
// NOTE(review): the remaining InputStreamReader arguments and the statement that reads
// into tmp are not visible in this excerpt.
338 private int parseRecordLength(byte[] leaderData) throws IOException {
\r
339 InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
\r
// The record length occupies the first five characters of the leader.
342 char[] tmp = new char[5];
\r
345 length = Integer.parseInt(new String(tmp));
\r
346 } catch (NumberFormatException e) {
\r
347 throw new MarcException("unable to parse record length", e);
\r
// Populates ldr from the leader bytes. Values are consumed in the order shown below:
// record status, type of record, implementation-defined 1, character coding scheme,
// indicator count, subfield code length, base address of data, implementation-defined 2
// and entry map. The record length itself is skipped because the caller has already set it.
// The three numeric values are parsed with Integer.parseInt; each NumberFormatException
// is rethrown as a MarcException that names the value that failed.
// NOTE(review): the statements that refill tmp before each setImplDefined and
// setEntryMap call are not visible in this excerpt.
352 private void parseLeader(Leader ldr, byte[] leaderData) throws IOException {
\r
353 InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
\r
355 char[] tmp = new char[5];
\r
357 // Skip over bytes for record length. If we get here, it has already been computed.
\r
358 ldr.setRecordStatus((char) isr.read());
\r
359 ldr.setTypeOfRecord((char) isr.read());
\r
362 ldr.setImplDefined1(tmp);
\r
363 ldr.setCharCodingScheme((char) isr.read());
\r
// The two single-digit counts are kept as chars here and parsed to ints further down.
364 char indicatorCount = (char) isr.read();
\r
365 char subfieldCodeLength = (char) isr.read();
\r
366 char baseAddr[] = new char[5];
\r
// NOTE(review): the return value of read is ignored, so a short read would go unnoticed
// until the parse below fails - confirm.
367 isr.read(baseAddr);
\r
370 ldr.setImplDefined2(tmp);
\r
373 ldr.setEntryMap(tmp);
\r
376 ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount)));
\r
377 } catch (NumberFormatException e) {
\r
378 throw new MarcException("unable to parse indicator count", e);
\r
381 ldr.setSubfieldCodeLength(Integer.parseInt(String
\r
382 .valueOf(subfieldCodeLength)));
\r
383 } catch (NumberFormatException e) {
\r
384 throw new MarcException("unable to parse subfield code length", e);
\r
387 ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr)));
\r
388 } catch (NumberFormatException e) {
\r
389 throw new MarcException("unable to parse base address of data", e);
\r
// Decodes raw field bytes into a String according to the current value of the encoding field.
// UTF-8 or UTF8: decoded with the UTF8 charset name.
// MARC-8 or MARC8: converted with an AnselToUnicode converter that is created on first use.
// ISO-8859-1 or ISO8859_1: decoded with the ISO-8859-1 charset name.
// An UnsupportedEncodingException is rethrown as a MarcException.
// NOTE(review): no branch for any other encoding name is visible, so dataElement appears
// to stay null and null would be returned for an unrecognised encoding - confirm, and
// consider a fallback that decodes with the configured encoding name.
394 private String getDataAsString(byte[] bytes)
\r
396 String dataElement = null;
\r
397 if (encoding.equals("UTF-8") || encoding.equals("UTF8"))
\r
400 dataElement = new String(bytes, "UTF8");
\r
402 catch (UnsupportedEncodingException e) {
\r
403 throw new MarcException("unsupported encoding", e);
\r
406 else if (encoding.equals("MARC-8") || encoding.equals("MARC8"))
\r
// The converter is created lazily and then reused for later calls.
408 if (converterAnsel == null) converterAnsel = new AnselToUnicode();
\r
409 dataElement = converterAnsel.convert(bytes);
\r
411 else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1"))
\r
414 dataElement = new String(bytes, "ISO-8859-1");
\r
416 catch (UnsupportedEncodingException e) {
\r
417 throw new MarcException("unsupported encoding", e);
\r
420 return dataElement;
\r