1 // $Id: MarcStreamReader.java,v 1.11 2008/09/26 21:17:42 haschart Exp $
\r
3 * Copyright (C) 2004 Bas Peters
\r
5 * This file is part of MARC4J
\r
7 * MARC4J is free software; you can redistribute it and/or
\r
8 * modify it under the terms of the GNU Lesser General Public
\r
9 * License as published by the Free Software Foundation; either
\r
10 * version 2.1 of the License, or (at your option) any later version.
\r
12 * MARC4J is distributed in the hope that it will be useful,
\r
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
15 * Lesser General Public License for more details.
\r
17 * You should have received a copy of the GNU Lesser General Public
\r
18 * License along with MARC4J; if not, write to the Free Software
\r
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
23 import java.io.BufferedInputStream;
\r
24 import java.io.ByteArrayInputStream;
\r
25 import java.io.DataInputStream;
\r
26 import java.io.EOFException;
\r
27 import java.io.IOException;
\r
28 import java.io.InputStream;
\r
29 import java.io.InputStreamReader;
\r
30 import java.io.UnsupportedEncodingException;
\r
32 import org.marc4j.converter.CharConverter;
\r
33 import org.marc4j.converter.impl.AnselToUnicode;
\r
34 import org.marc4j.marc.ControlField;
\r
35 import org.marc4j.marc.DataField;
\r
36 import org.marc4j.marc.Leader;
\r
37 import org.marc4j.marc.MarcFactory;
\r
38 import org.marc4j.marc.Record;
\r
39 import org.marc4j.marc.Subfield;
\r
40 import org.marc4j.marc.impl.Verifier;
\r
43 * An iterator over a collection of MARC records in ISO 2709 format.
\r
48 * InputStream input = new FileInputStream("file.mrc");
\r
49 * MarcReader reader = new MarcStreamReader(input);
\r
50 * while (reader.hasNext()) {
\r
51 * Record record = reader.next();
\r
57 * Check the {@link org.marc4j.marc} package for examples about the use of
\r
58 * the {@link org.marc4j.marc.Record} object model.
\r
62 * When no encoding is given as a constructor argument the parser tries to
\r
63 * resolve the encoding by looking at the character coding scheme (leader
\r
64 * position 9) in MARC21 records. For UNIMARC records this position is not
\r
68 * @author Bas Peters
\r
69 * @version $Revision: 1.11 $
\r
72 public class MarcStreamReader implements MarcReader {
\r
74 private DataInputStream input = null;
\r
76 private Record record;
\r
78 private MarcFactory factory;
\r
80 private String encoding = "ISO8859_1";
\r
82 private boolean override = false;
\r
84 private CharConverter converterAnsel = null;
\r
86 private boolean setBadIndicators = true;
\r
/**
 * Constructs an instance with the specified input stream and no
 * caller-supplied encoding.
 *
 * @param input the stream to read records from
 */
public MarcStreamReader(InputStream input) {
    this(input, null);
}
\r
/**
 * Constructs an instance with the specified input stream and encoding.
 * The stream is wrapped in a {@link BufferedInputStream} and a
 * {@link DataInputStream}.
 *
 * @param input    the stream to read records from
 * @param encoding the encoding to decode field data with; when
 *                 {@code null} the reader keeps its default encoding
 */
public MarcStreamReader(InputStream input, String encoding) {
    this.factory = MarcFactory.newInstance();
    this.input = new DataInputStream(new BufferedInputStream(input));
    if (encoding == null) {
        return;
    }
    // An explicit encoding takes precedence over the one named in the leader.
    this.encoding = encoding;
    this.override = true;
}
\r
108 * Returns true if the iteration has more records, false otherwise.
\r
110 public boolean hasNext() {
\r
112 if (input.available() == 0)
\r
114 } catch (IOException e) {
\r
115 throw new MarcException(e.getMessage(), e);
\r
121 * Returns the next record in the iteration.
\r
123 * @return Record - the record object
\r
125 public Record next()
\r
127 record = factory.newRecord();
\r
131 byte[] byteArray = new byte[24];
\r
132 input.readFully(byteArray);
\r
134 int recordLength = parseRecordLength(byteArray);
\r
135 byte[] recordBuf = new byte[recordLength - 24];
\r
136 input.readFully(recordBuf);
\r
137 parseRecord(record, byteArray, recordBuf, recordLength);
\r
140 catch (EOFException e) {
\r
141 throw new MarcException("Premature end of file encountered", e);
\r
143 catch (IOException e) {
\r
144 throw new MarcException("an error occured reading input", e);
\r
148 private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength)
\r
151 ldr = factory.newLeader();
\r
152 ldr.setRecordLength(recordLength);
\r
153 int directoryLength=0;
\r
156 parseLeader(ldr, byteArray);
\r
157 directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
\r
159 catch (IOException e) {
\r
160 throw new MarcException("error parsing leader with data: "
\r
161 + new String(byteArray), e);
\r
163 catch (MarcException e) {
\r
164 throw new MarcException("error parsing leader with data: "
\r
165 + new String(byteArray), e);
\r
168 // if MARC 21 then check encoding
\r
169 switch (ldr.getCharCodingScheme()) {
\r
172 encoding = "ISO-8859-1";
\r
178 record.setLeader(ldr);
\r
180 if ((directoryLength % 12) != 0)
\r
182 throw new MarcException("invalid directory");
\r
184 DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
\r
185 int size = directoryLength / 12;
\r
187 String[] tags = new String[size];
\r
188 int[] lengths = new int[size];
\r
190 byte[] tag = new byte[3];
\r
191 byte[] length = new byte[4];
\r
192 byte[] start = new byte[5];
\r
197 for (int i = 0; i < size; i++)
\r
199 inputrec.readFully(tag);
\r
200 tmp = new String(tag);
\r
203 inputrec.readFully(length);
\r
204 tmp = new String(length);
\r
205 lengths[i] = Integer.parseInt(tmp);
\r
207 inputrec.readFully(start);
\r
210 if (inputrec.read() != Constants.FT)
\r
212 throw new MarcException("expected field terminator at end of directory");
\r
215 for (int i = 0; i < size; i++)
\r
217 //int fieldLength = getFieldLength(inputrec);
\r
218 if (Verifier.isControlField(tags[i]))
\r
220 byteArray = new byte[lengths[i] - 1];
\r
221 inputrec.readFully(byteArray);
\r
223 if (inputrec.read() != Constants.FT)
\r
225 throw new MarcException("expected field terminator at end of field");
\r
228 ControlField field = factory.newControlField();
\r
229 field.setTag(tags[i]);
\r
230 field.setData(getDataAsString(byteArray));
\r
231 record.addVariableField(field);
\r
235 byteArray = new byte[lengths[i]];
\r
236 inputrec.readFully(byteArray);
\r
239 DataField dataField = parseDataField(tags[i], byteArray);
\r
240 // dataField could be null if bad indicators
\r
241 if (dataField != null)
\r
242 record.addVariableField(dataField);
\r
243 } catch (IOException e) {
\r
244 throw new MarcException(
\r
245 "error parsing data field for tag: " + tags[i]
\r
247 + new String(byteArray), e);
\r
252 if (inputrec.read() != Constants.RT)
\r
254 throw new MarcException("expected record terminator");
\r
257 catch (IOException e)
\r
259 throw new MarcException("an error occured reading input", e);
\r
263 private DataField parseDataField(String tag, byte[] field)
\r
264 throws IOException {
\r
265 ByteArrayInputStream bais = new ByteArrayInputStream(field);
\r
266 char ind1 = (char) bais.read();
\r
267 char ind2 = (char) bais.read();
\r
269 DataField dataField = factory.newDataField();
\r
270 dataField.setTag(tag);
\r
272 boolean badIndicatorFound = false;
\r
273 if (setBadIndicators || ind1 >= ' ' )
\r
274 dataField.setIndicator1(ind1);
\r
276 badIndicatorFound = true;
\r
277 if (setBadIndicators || ind2 >= ' ')
\r
278 dataField.setIndicator2(ind2);
\r
280 badIndicatorFound = true;
\r
287 readByte = bais.read();
\r
290 switch (readByte) {
\r
292 code = bais.read();
\r
294 throw new IOException("unexpected end of data field");
\r
295 if (code == Constants.FT)
\r
297 size = getSubfieldLength(bais);
\r
298 data = new byte[size];
\r
300 subfield = factory.newSubfield();
\r
301 subfield.setCode((char) code);
\r
302 subfield.setData(getDataAsString(data));
\r
303 dataField.addSubfield(subfield);
\r
309 /* Bad Indicators was found, so dropping field */
\r
310 if (badIndicatorFound)
\r
315 @SuppressWarnings("unused")
\r
316 private int getFieldLength(DataInputStream bais) throws IOException
\r
321 switch (bais.read()) {
\r
327 throw new IOException("Field not terminated");
\r
335 private int getSubfieldLength(ByteArrayInputStream bais) throws IOException {
\r
339 switch (bais.read()) {
\r
346 throw new IOException("subfield not terminated");
\r
353 private int parseRecordLength(byte[] leaderData) throws IOException {
\r
354 InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
\r
357 char[] tmp = new char[5];
\r
360 length = Integer.parseInt(new String(tmp));
\r
361 } catch (NumberFormatException e) {
\r
362 throw new MarcException("unable to parse record length", e);
\r
367 private void parseLeader(Leader ldr, byte[] leaderData) throws IOException {
\r
368 InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
\r
370 char[] tmp = new char[5];
\r
372 // Skip over bytes for record length, If we get here, its already been computed.
\r
373 ldr.setRecordStatus((char) isr.read());
\r
374 ldr.setTypeOfRecord((char) isr.read());
\r
377 ldr.setImplDefined1(tmp);
\r
378 ldr.setCharCodingScheme((char) isr.read());
\r
379 char indicatorCount = (char) isr.read();
\r
380 char subfieldCodeLength = (char) isr.read();
\r
381 char baseAddr[] = new char[5];
\r
382 isr.read(baseAddr);
\r
385 ldr.setImplDefined2(tmp);
\r
388 ldr.setEntryMap(tmp);
\r
391 ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount)));
\r
392 } catch (NumberFormatException e) {
\r
393 throw new MarcException("unable to parse indicator count", e);
\r
396 ldr.setSubfieldCodeLength(Integer.parseInt(String
\r
397 .valueOf(subfieldCodeLength)));
\r
398 } catch (NumberFormatException e) {
\r
399 throw new MarcException("unable to parse subfield code length", e);
\r
402 ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr)));
\r
403 } catch (NumberFormatException e) {
\r
404 throw new MarcException("unable to parse base address of data", e);
\r
409 private String getDataAsString(byte[] bytes)
\r
411 String dataElement = null;
\r
412 if (encoding.equals("UTF-8") || encoding.equals("UTF8"))
\r
415 dataElement = new String(bytes, "UTF8");
\r
417 catch (UnsupportedEncodingException e) {
\r
418 throw new MarcException("unsupported encoding", e);
\r
421 else if (encoding.equals("MARC-8") || encoding.equals("MARC8"))
\r
423 if (converterAnsel == null) converterAnsel = new AnselToUnicode();
\r
425 for (int index = 0; index < bytes.length; index++)
\r
426 if (bytes[index] < 32)
\r
427 bytes[index] = ' ';
\r
428 dataElement = converterAnsel.convert(bytes);
\r
429 //dataElement = dataElement.replaceAll("\0", " ");
\r
431 else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1"))
\r
434 dataElement = new String(bytes, "ISO-8859-1");
\r
436 catch (UnsupportedEncodingException e) {
\r
437 throw new MarcException("unsupported encoding", e);
\r
440 return dataElement;
\r
443 public boolean isBadIndicators() {
\r
444 return setBadIndicators;
\r
447 public void setBadIndicators(boolean trueFalse) {
\r
448 this.setBadIndicators = trueFalse;
\r