Skip to content

Commit

Permalink
issue #113: Reading MARCMaker format
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Dec 10, 2021
1 parent ec985c1 commit 4dcd4d6
Show file tree
Hide file tree
Showing 11 changed files with 538 additions and 8 deletions.
32 changes: 32 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/MarcFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import de.gwdg.metadataqa.marc.utils.alephseq.AlephseqLine;
import de.gwdg.metadataqa.marc.utils.MapToDatafield;

import de.gwdg.metadataqa.marc.utils.alephseq.MarcMakerLine;
import de.gwdg.metadataqa.marc.utils.alephseq.MarclineLine;
import de.gwdg.metadataqa.marc.utils.pica.PicaFieldDefinition;
import de.gwdg.metadataqa.marc.utils.pica.PicaLine;
Expand Down Expand Up @@ -427,6 +428,37 @@ public static Record createRecordFromMarcline(List<MarclineLine> lines) {
return marc4jRecord;
}

/**
 * Assembles a marc4j record from already parsed MARCMaker lines.
 *
 * <p>A leader line sets the record leader, a control field line (00X) becomes a
 * control field, and every other line with a three digit tag becomes a data field
 * whose subfields come from {@code MarcMakerLine.parseSubfields()}. Lines that are
 * neither a leader nor carry a numeric tag are ignored.
 *
 * @param lines the lines that make up one record
 * @return the populated record (never {@code null})
 */
public static Record createRecordFromMarcMaker(List<MarcMakerLine> lines) {
  Record marc4jRecord = new RecordImpl();
  for (MarcMakerLine line : lines) {
    if (line.isLeader()) {
      try {
        marc4jRecord.setLeader(new LeaderImpl(line.getContent()));
      } catch (StringIndexOutOfBoundsException e) {
        // a too short leader is reported, the remaining lines are still processed
        logger.severe("Error at creating leader: " + e.getMessage());
      }
      continue;
    }

    if (!line.isNumericTag()) {
      continue;
    }

    if (line.isControlField()) {
      marc4jRecord.addVariableField(new ControlFieldImpl(line.getTag(), line.getContent()));
      continue;
    }

    char indicator1 = line.getInd1().charAt(0);
    char indicator2 = line.getInd2().charAt(0);
    var dataField = new DataFieldImpl(line.getTag(), indicator1, indicator2);
    for (String[] codeAndValue : line.parseSubfields()) {
      boolean wellFormed = codeAndValue.length == 2
        && codeAndValue[0] != null
        && codeAndValue[1] != null;
      if (!wellFormed) {
        logger.warning(String.format(
          "parse error in record #%s) tag %s: '%s'",
          line.getRecordID(), line.getTag(), line.getRawContent()
        ));
        continue;
      }
      dataField.addSubfield(new SubfieldImpl(codeAndValue[0].charAt(0), codeAndValue[1]));
    }
    marc4jRecord.addVariableField(dataField);
  }
  return marc4jRecord;
}

public static Record createRecordFromPica(List<PicaLine> lines) {
Record marc4jRecord = new RecordImpl();
String id = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ public enum MarcFormat {
XML("XML", "MARCXML"),
ALEPHSEQ("ALEPHSEQ", "ALEPHSEQ"),
LINE_SEPARATED("LINE_SEPARATED", "Line separated binary MARC (each line contains one record)"),
MARC_LINE("MARC_LINE", "MARC Line")
;
MARC_LINE("MARC_LINE", "MARC Line"),
MARC_MAKER("MARC_MAKER", "MARCMaker");

String code;
String label;
Expand Down
12 changes: 12 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/utils/ReadMarc.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ public static MarcReader getMarclineStreamReader(InputStream stream) throws Exce
return new MarclineReader(stream);
}

/**
 * Creates a reader for a MARCMaker formatted file.
 *
 * <p>The reader class is referenced by its fully qualified name so that this
 * method does not depend on an additional import.
 *
 * @param fileName path of the MARCMaker file
 * @return a reader that parses the file as MARCMaker (not as MARC Line)
 * @throws Exception declared for symmetry with the other reader factories
 */
public static MarcReader getMarcMakerFileReader(String fileName) throws Exception {
  return new de.gwdg.metadataqa.marc.utils.marcreader.MarcMakerReader(fileName);
}

/**
 * Creates a reader for MARCMaker formatted content coming from a stream.
 *
 * <p>The reader class is referenced by its fully qualified name so that this
 * method does not depend on an additional import.
 *
 * @param stream the stream providing the MARCMaker content
 * @return a reader that parses the stream as MARCMaker (not as MARC Line)
 * @throws Exception declared for symmetry with the other reader factories
 */
public static MarcReader getMarcMakerStreamReader(InputStream stream) throws Exception {
  return new de.gwdg.metadataqa.marc.utils.marcreader.MarcMakerReader(stream);
}

/**
 * Convenience overload: delegates to the three argument {@code getReader},
 * passing {@code false} as its third argument.
 *
 * @param fileName  the file to read
 * @param isMarcxml forwarded unchanged to the three argument overload
 * @return the reader produced by the three argument overload
 * @throws Exception whatever the three argument overload throws
 */
public static MarcReader getReader(String fileName, boolean isMarcxml) throws Exception {
  MarcReader reader = getReader(fileName, isMarcxml, false);
  return reader;
}
Expand All @@ -104,6 +112,8 @@ public static MarcReader getFileReader(MarcFormat marcFormat, String fileName, S
reader = ReadMarc.getXmlFileReader(fileName); break;
case MARC_LINE:
reader = ReadMarc.getMarclineFileReader(fileName); break;
case MARC_MAKER:
reader = ReadMarc.getMarcMakerFileReader(fileName); break;
case ISO:
default:
reader = ReadMarc.getIsoFileReader(fileName, encoding); break;
Expand All @@ -126,6 +136,8 @@ public static MarcReader getStreamReader(MarcFormat marcFormat, InputStream stre
reader = ReadMarc.getXmlStreamReader(stream); break;
case MARC_LINE:
reader = ReadMarc.getMarclineStreamReader(stream); break;
case MARC_MAKER:
reader = ReadMarc.getMarcMakerStreamReader(stream); break;
case ISO:
default:
reader = ReadMarc.getIsoStreamReader(stream, encoding); break;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
package de.gwdg.metadataqa.marc.utils.alephseq;

import de.gwdg.metadataqa.marc.dao.DataField;
import org.apache.commons.lang.StringUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/**
 * A single line of a MARCMaker file.
 *
 * <p>A line starting with {@code =} introduces a field: characters 1-3 are the tag.
 * {@code LDR} and {@code 000} denote the leader. Control fields (00X) have their
 * content from position 6; all other fields have the two indicators at positions
 * 6 and 7 and their content from position 8. A backslash stands for a space in
 * leader, control field content and indicators. A line that does not start with
 * {@code =} is a continuation of the previous field.
 *
 * <p>Lines that are too short for their type never throw; they are flagged as
 * invalid instead (see {@link #isValid()}).
 */
public class MarcMakerLine {
  private static final Logger logger = Logger.getLogger(MarcMakerLine.class.getCanonicalName());

  private static final String LDR = "LDR";
  private static final String NUMERIC_LEADER_TAG = "000";
  private static final Pattern numericTag = Pattern.compile("^\\d\\d\\d$");
  private static final Pattern controlField = Pattern.compile("^00\\d$");
  public static final String SEPARATOR = "\\$";

  /** Minimal length of a line holding '=' and a three character tag. */
  private static final int TAG_END = 4;
  /** Position where the content of a control field starts. */
  private static final int CONTROL_CONTENT_START = 6;
  /** Positions of the indicators and of the content of a data field. */
  private static final int IND1_POSITION = 6;
  private static final int IND2_POSITION = 7;
  private static final int DATA_CONTENT_START = 8;

  private int lineNumber = 0;

  private String recordID;
  private String tag;
  private String ind1;
  private String ind2;
  private String content;
  private boolean valid = true;
  private boolean isLeader = false;

  public MarcMakerLine() {
  }

  /**
   * @param raw the raw line as read from the file
   */
  public MarcMakerLine(String raw) {
    parse(raw);
  }

  /**
   * @param raw        the raw line as read from the file
   * @param lineNumber the number of the line within the file (used in log messages)
   */
  public MarcMakerLine(String raw, int lineNumber) {
    this.lineNumber = lineNumber;
    parse(raw);
  }

  /**
   * @return true if the line was recognised as leader (tag LDR or 000)
   */
  public boolean isLeader() {
    return isLeader;
  }

  /**
   * @return true if the tag consists of exactly three digits
   */
  public boolean isNumericTag() {
    if (tag == null) {
      return false;
    }
    return numericTag.matcher(tag).matches();
  }

  /**
   * @return true if the tag is a control field tag (00X)
   */
  public boolean isControlField() {
    if (tag == null) {
      return false;
    }
    return controlField.matcher(tag).matches();
  }

  /**
   * @return true if the line is well formed and is either the leader or has a numeric tag
   */
  public boolean isValidTag() {
    return (isValid() && (isLeader() || isNumericTag()));
  }

  public String getRecordID() {
    return recordID;
  }

  /**
   * @return the tag, or null for the leader and for continuation lines
   */
  public String getTag() {
    return tag;
  }

  public String getInd1() {
    return ind1;
  }

  public String getInd2() {
    return ind2;
  }

  /**
   * Returns the content with the placeholders resolved: in the leader and in
   * control fields {@code ^} becomes a space, elsewhere {@code $$} becomes {@code $}.
   *
   * @return the resolved content, or null if the line has no content
   */
  public String getContent() {
    if (content == null) {
      return content;
    }

    if (isLeader() || isControlField()) {
      return content.replace("^", " ");
    }
    return content.replace("$$", "$");
  }

  /**
   * @return the content as stored, without resolving placeholders
   */
  public String getRawContent() {
    return content;
  }

  /**
   * Splits a raw line into tag, indicators and content. Never throws on malformed
   * input: too short lines are marked as invalid.
   */
  private void parse(String raw) {
    if (raw == null) {
      valid = false;
      return;
    }

    if (!raw.startsWith("=")) {
      // continuing line (this includes empty lines)
      content = raw.trim();
      return;
    }

    if (raw.length() < TAG_END) {
      // keep a non null tag, so the line is not mistaken for a continuation line
      tag = raw.substring(1);
      content = "";
      markInvalid(raw);
      return;
    }

    tag = raw.substring(1, TAG_END);
    if (tag.equals(LDR) || tag.equals(NUMERIC_LEADER_TAG)) {
      isLeader = true;
      content = raw.replaceAll("^=... ", "").replace("\\", " ").trim();
      tag = null;
    } else if (isControlField()) {
      if (raw.length() < CONTROL_CONTENT_START) {
        content = "";
        markInvalid(raw);
      } else {
        content = raw.substring(CONTROL_CONTENT_START).replace("\\", " ").trim();
      }
    } else {
      ind1 = indicatorAt(raw, IND1_POSITION);
      ind2 = indicatorAt(raw, IND2_POSITION);
      if (raw.length() < DATA_CONTENT_START) {
        content = "";
        markInvalid(raw);
      } else {
        content = raw.substring(DATA_CONTENT_START).trim();
      }
    }
  }

  /** Reads the indicator at the given position; a missing one is treated as blank. */
  private static String indicatorAt(String raw, int position) {
    if (raw.length() <= position) {
      return " ";
    }
    return raw.substring(position, position + 1).replace("\\", " ");
  }

  /** Flags the line as malformed and reports it. */
  private void markInvalid(String raw) {
    valid = false;
    logger.warning(String.format("line #%d: malformed MARCMaker line: '%s'", lineNumber, raw));
  }

  /**
   * Splits the raw content at {@code $} into subfields.
   *
   * @return a list of two element arrays (subfield code, subfield value); empty if
   *         the line has no content
   */
  public List<String[]> parseSubfields() {
    List<String[]> subfields = new ArrayList<>();
    if (content == null) {
      return subfields;
    }
    for (String segment : content.split(SEPARATOR)) {
      if (StringUtils.isNotBlank(segment)) {
        subfields.add(new String[]{segment.substring(0, 1), segment.substring(1)});
      }
    }
    return subfields;
  }

  /**
   * @return false if the line was too short to be parsed according to its type
   */
  public boolean isValid() {
    return valid;
  }

  @Override
  public String toString() {
    return "MarcMakerLine{" +
      "recordID='" + recordID + '\'' +
      ", tag='" + tag + '\'' +
      ", ind1='" + ind1 + '\'' +
      ", ind2='" + ind2 + '\'' +
      ", content='" + getContent() + '\'' +
      '}';
  }

  public List<String[]> getSubfields() {
    return DataField.parseSubfields(getContent());
  }

  /**
   * Appends the text of a continuation line, separated by a space. Null or empty
   * text is ignored, so blank lines do not add trailing spaces.
   *
   * @param extra the text to append
   */
  public void appendContent(String extra) {
    if (extra == null || extra.isEmpty()) {
      return;
    }
    content = (content == null) ? extra : content + " " + extra;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package de.gwdg.metadataqa.marc.utils.marcreader;

import de.gwdg.metadataqa.marc.MarcFactory;
import de.gwdg.metadataqa.marc.utils.alephseq.MarcMakerLine;
import de.gwdg.metadataqa.marc.utils.alephseq.MarclineLine;
import org.marc4j.MarcReader;
import org.marc4j.marc.Record;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Reads records from MARCMaker formatted text and exposes them through the marc4j
 * {@link MarcReader} interface.
 *
 * <p>A record starts at a leader line and lasts until the next leader line or the
 * end of the input. Blank lines are ignored. A line that does not start a field
 * is appended to the previous field.
 *
 * <p>Records without leader or without control number (001) are skipped and
 * counted (see {@link #getSkippedRecords()}); note that this check is not applied
 * to the last record of the input.
 */
public class MarcMakerReader implements MarcReader {

  private static final Logger logger = Logger.getLogger(MarcMakerReader.class.getCanonicalName());

  private enum LEVEL {
    WARN, SEVERE
  }

  private BufferedReader bufferedReader = null;
  /** The line read ahead from the input but not yet processed; null at the end. */
  private String line = null;
  private int lineNumber = 0;
  private int skippedRecords = 0;
  /** The lines collected so far for the record under construction. */
  private List<MarcMakerLine> lines = new ArrayList<>();
  private String currentId = null;

  /**
   * @param content the path of the file to read. If the file can not be opened the
   *                problem is logged and the reader behaves as an empty input.
   */
  public MarcMakerReader(String content) {
    try {
      bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(content), "UTF8"));
    } catch (IOException e) {
      logger.log(Level.WARNING, "MarcMakerReader", e);
    }
  }

  /**
   * @param stream the stream to read, interpreted as UTF-8
   */
  public MarcMakerReader(InputStream stream) {
    try {
      bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF8"));
    } catch (IOException e) {
      logger.log(Level.WARNING, "MarcMakerReader", e);
    }
  }

  /**
   * @return true if there is still unprocessed input
   */
  @Override
  public boolean hasNext() {
    if (bufferedReader == null) {
      // the input could not be opened
      return false;
    }
    if (lineNumber == 0) {
      // first call: read ahead one line, later on next() keeps the look-ahead filled
      line = readLine("hasNext");
      lineNumber++;
    }
    return (line != null);
  }

  /**
   * Reads lines until the next record is complete.
   *
   * @return the next record, or null if no further valid record is available
   */
  @Override
  public Record next() {
    // make sure the look-ahead line is available even if hasNext() was not called
    hasNext();

    Record marc4jRecord = null;
    while (line != null && marc4jRecord == null) {
      MarcMakerLine marcMakerLine = parseCurrentLine();
      if (marcMakerLine != null) {
        if (marcMakerLine.isLeader() && !lines.isEmpty()) {
          // a new leader closes the record collected so far
          marc4jRecord = buildRecordOrSkip();
          lines = new ArrayList<>();
        }
        collect(marcMakerLine);
      }

      line = readLine("next");
      lineNumber++;
    }

    // End of input: flush the last record. If a record has just been completed it is
    // returned as is; a trailing leader without any field is not a record of its own.
    if (marc4jRecord == null && line == null && !lines.isEmpty()) {
      marc4jRecord = MarcFactory.createRecordFromMarcMaker(lines);
      lines = new ArrayList<>();
    }
    return marc4jRecord;
  }

  /**
   * Reads the next line. An I/O error is logged and treated as the end of the
   * input, otherwise the same line would be processed over and over again.
   */
  private String readLine(String caller) {
    try {
      return bufferedReader.readLine();
    } catch (IOException e) {
      logger.log(Level.WARNING, caller, e);
      return null;
    }
  }

  /** Parses the current line; returns null for blank or unparsable lines. */
  private MarcMakerLine parseCurrentLine() {
    if (line.trim().isEmpty()) {
      return null;
    }
    try {
      return new MarcMakerLine(line, lineNumber);
    } catch (StringIndexOutOfBoundsException e) {
      logger.log(Level.WARNING,
        String.format("line #%d: malformed MARCMaker line '%s'. Skipped.", lineNumber, line), e);
      return null;
    }
  }

  /** Creates the record from the collected lines; returns null if it has to be skipped. */
  private Record buildRecordOrSkip() {
    Record candidate = MarcFactory.createRecordFromMarcMaker(lines);
    if (candidate.getControlNumber() == null) {
      logSkipped("does not have a control number field (001)");
      return null;
    }
    if (candidate.getLeader() == null) {
      logSkipped("does not have a leader");
      return null;
    }
    return candidate;
  }

  /** Adds a field line to the current record, or appends a continuation line to the last field. */
  private void collect(MarcMakerLine marcMakerLine) {
    boolean isContinuation = !marcMakerLine.isLeader() && marcMakerLine.getTag() == null;
    if (isContinuation) {
      if (lines.isEmpty()) {
        logger.warning(String.format(
          "line #%d: continuation line without a preceding field. Skipped.", lineNumber));
      } else {
        lines.get(lines.size() - 1).appendContent(marcMakerLine.getContent());
      }
      return;
    }

    if (marcMakerLine.isValidTag()) {
      lines.add(marcMakerLine);
    }
    currentId = marcMakerLine.getRecordID();
  }

  public int getLineNumber() {
    return lineNumber;
  }

  /**
   * @return the number of records skipped so far
   */
  public int getSkippedRecords() {
    return skippedRecords;
  }

  private void logSkipped(String message) {
    logSkipped(MarcMakerReader.LEVEL.SEVERE, message);
  }

  private void logSkipped(MarcMakerReader.LEVEL level, String message) {
    String entry = String.format(
      "line #%d: record %s %s. Skipped.",
      lineNumber, currentId, message
    );

    // WARN level entries are counted only, they are intentionally not written to the log
    if (!level.equals(MarcMakerReader.LEVEL.WARN)) {
      logger.severe(entry);
    }

    skippedRecords++;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ public MarclineReader(String content) {
try {
bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(content), "UTF8"));
} catch (IOException e) {
logger.log(Level.WARNING, "AlephseqMarcReader", e);
logger.log(Level.WARNING, "MarclineReader", e);
}
}

public MarclineReader(InputStream stream) {
try {
bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF8"));
} catch (IOException e) {
logger.log(Level.WARNING, "AlephseqMarcReader", e);
logger.log(Level.WARNING, "MarclineReader", e);
}
}

Expand Down
Loading

0 comments on commit 4dcd4d6

Please sign in to comment.