-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
issue #113: Reading MARCMaker format
- Loading branch information
Showing
11 changed files
with
538 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
152 changes: 152 additions & 0 deletions
152
src/main/java/de/gwdg/metadataqa/marc/utils/alephseq/MarcMakerLine.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
package de.gwdg.metadataqa.marc.utils.alephseq; | ||
|
||
import de.gwdg.metadataqa.marc.dao.DataField; | ||
import org.apache.commons.lang.StringUtils; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.logging.Logger; | ||
import java.util.regex.Pattern; | ||
|
||
public class MarcMakerLine { | ||
private static final Logger logger = Logger.getLogger(MarcMakerLine.class.getCanonicalName()); | ||
|
||
private static final String LDR = "LDR"; | ||
private static final Pattern numericTag = Pattern.compile("^\\d\\d\\d$"); | ||
private static final Pattern controlField = Pattern.compile("^00\\d$"); | ||
public static final String SEPARATOR = "\\$"; | ||
private int lineNumber = 0; | ||
|
||
private String recordID; | ||
private String tag; | ||
private String ind1; | ||
private String ind2; | ||
private String content; | ||
private boolean valid = true; | ||
private boolean isLeader = false; | ||
|
||
public MarcMakerLine() { | ||
} | ||
|
||
public MarcMakerLine(String raw) { | ||
parse(raw); | ||
} | ||
|
||
public MarcMakerLine(String raw, int lineNumber) { | ||
this.lineNumber = lineNumber; | ||
parse(raw); | ||
} | ||
|
||
public boolean isLeader() { | ||
return isLeader; | ||
/* | ||
if (tag == null) | ||
return false; | ||
return tag.equals(LDR); | ||
*/ | ||
} | ||
|
||
public boolean isNumericTag() { | ||
if (tag == null) | ||
return false; | ||
|
||
return numericTag.matcher(tag).matches(); | ||
} | ||
|
||
public boolean isControlField() { | ||
if (tag == null) | ||
return false; | ||
|
||
return controlField.matcher(tag).matches(); | ||
} | ||
|
||
public boolean isValidTag() { | ||
return (isValid() && (isLeader() || isNumericTag())); | ||
} | ||
|
||
public String getRecordID() { | ||
return recordID; | ||
} | ||
|
||
public String getTag() { | ||
return tag; | ||
} | ||
|
||
public String getInd1() { | ||
return ind1; | ||
} | ||
|
||
public String getInd2() { | ||
return ind2; | ||
} | ||
|
||
public String getContent() { | ||
if (content == null) | ||
return content; | ||
|
||
if (isLeader() || isControlField()) | ||
return content.replace("^", " "); | ||
else | ||
return content.replace("$$", "$"); | ||
} | ||
|
||
public String getRawContent() { | ||
return content; | ||
} | ||
|
||
private void parse(String raw) { | ||
if (raw.substring(0, 1).equals("=")) { | ||
tag = raw.substring(1, 4); | ||
// marcRecord.setField(tag, content, marcVersion); | ||
if (tag.equals("LDR") || tag.equals("000")) { | ||
isLeader = true; | ||
content = raw.replaceAll("^=... ", "").replace("\\", " ").trim(); | ||
tag = null; | ||
} else { | ||
if (isControlField()) | ||
content = raw.substring(6).replace("\\", " ").trim(); | ||
else { | ||
ind1 = raw.substring(6, 7).replace("\\", " "); | ||
ind2 = raw.substring(7, 8).replace("\\", " "); | ||
content = raw.substring(8).trim(); | ||
} | ||
} | ||
} else { | ||
// continuing line | ||
content = raw.trim(); | ||
} | ||
} | ||
|
||
public List<String[]> parseSubfields() { | ||
List<String[]> subfields = new ArrayList<>(); | ||
String[] segments = content.split(SEPARATOR); | ||
for (String segment : segments) { | ||
if (StringUtils.isNotBlank(segment)) | ||
subfields.add(new String[]{segment.substring(0, 1), segment.substring(1)}); | ||
} | ||
return subfields; | ||
} | ||
|
||
public boolean isValid() { | ||
return valid; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "AlephseqLine{" + | ||
"recordID='" + recordID + '\'' + | ||
", tag='" + tag + '\'' + | ||
", ind1='" + ind1 + '\'' + | ||
", ind2='" + ind2 + '\'' + | ||
", content='" + getContent() + '\'' + | ||
'}'; | ||
} | ||
|
||
public List<String[]> getSubfields() { | ||
return DataField.parseSubfields(getContent()); | ||
} | ||
|
||
public void appendContent(String extra) { | ||
content += " " + extra; | ||
} | ||
} |
132 changes: 132 additions & 0 deletions
132
src/main/java/de/gwdg/metadataqa/marc/utils/marcreader/MarcMakerReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
package de.gwdg.metadataqa.marc.utils.marcreader; | ||
|
||
import de.gwdg.metadataqa.marc.MarcFactory; | ||
import de.gwdg.metadataqa.marc.utils.alephseq.MarcMakerLine; | ||
import de.gwdg.metadataqa.marc.utils.alephseq.MarclineLine; | ||
import org.marc4j.MarcReader; | ||
import org.marc4j.marc.Record; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.logging.Level; | ||
import java.util.logging.Logger; | ||
|
||
public class MarcMakerReader implements MarcReader { | ||
|
||
private static final Logger logger = Logger.getLogger(MarcMakerReader.class.getCanonicalName()); | ||
|
||
private enum LEVEL { | ||
WARN, SEVERE | ||
}; | ||
|
||
private BufferedReader bufferedReader = null; | ||
private String line = null; | ||
private boolean nextIsConsumed = false; | ||
private int lineNumber = 0; | ||
private int skippedRecords = 0; | ||
private List<MarcMakerLine> lines = new ArrayList<>(); | ||
private String currentId = null; | ||
|
||
public MarcMakerReader(String content) { | ||
try { | ||
bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(content), "UTF8")); | ||
} catch (IOException e) { | ||
logger.log(Level.WARNING, "MarcMakerReader", e); | ||
} | ||
} | ||
|
||
public MarcMakerReader(InputStream stream) { | ||
try { | ||
bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF8")); | ||
} catch (IOException e) { | ||
logger.log(Level.WARNING, "MarcMakerReader", e); | ||
} | ||
} | ||
|
||
@Override | ||
public boolean hasNext() { | ||
if (lineNumber == 0 || nextIsConsumed) { | ||
try { | ||
line = bufferedReader.readLine(); | ||
} catch (IOException e) { | ||
logger.log(Level.WARNING, "hasNext", e); | ||
} | ||
lineNumber++; | ||
nextIsConsumed = false; | ||
} | ||
return (line != null); | ||
} | ||
|
||
@Override | ||
public Record next() { | ||
Record marc4jRecord = null; | ||
boolean deleted = false; | ||
boolean finished = false; | ||
while (line != null && !finished) { | ||
MarcMakerLine marcMakerLine = new MarcMakerLine(line, lineNumber); | ||
if (marcMakerLine.isLeader() && !lines.isEmpty()) { | ||
marc4jRecord = MarcFactory.createRecordFromMarcMaker(lines); | ||
if (marc4jRecord.getControlNumber() == null) { | ||
logSkipped("does not have a control number field (001)"); | ||
} else if (marc4jRecord.getLeader() == null) { | ||
logSkipped("does not have a leader"); | ||
} else { | ||
finished = true; | ||
} | ||
lines = new ArrayList<>(); | ||
} | ||
|
||
if (!marcMakerLine.isLeader() && marcMakerLine.getTag() == null) { | ||
lines.get(lines.size() - 1).appendContent(marcMakerLine.getContent()); | ||
} else { | ||
if (marcMakerLine.isValidTag()) { | ||
lines.add(marcMakerLine); | ||
} | ||
currentId = marcMakerLine.getRecordID(); | ||
} | ||
|
||
try { | ||
line = bufferedReader.readLine(); | ||
lineNumber++; | ||
} catch (IOException e) { | ||
logger.log(Level.WARNING, "next", e); | ||
} | ||
} | ||
if (line == null && !lines.isEmpty()) { | ||
marc4jRecord = MarcFactory.createRecordFromMarcMaker(lines); | ||
} | ||
return marc4jRecord; | ||
} | ||
|
||
public int getLineNumber() { | ||
return lineNumber; | ||
} | ||
|
||
public int getSkippedRecords() { | ||
return skippedRecords; | ||
} | ||
|
||
private void logSkipped(String message) { | ||
logSkipped(MarcMakerReader.LEVEL.SEVERE, message); | ||
} | ||
|
||
private void logSkipped(MarcMakerReader.LEVEL level, String message) { | ||
String entry = String.format( | ||
"line #%d: record %s %s. Skipped.", | ||
lineNumber, currentId, message | ||
); | ||
|
||
if (level.equals(MarcMakerReader.LEVEL.WARN)) { | ||
// logger.warning(entry); | ||
} else { | ||
logger.severe(entry); | ||
} | ||
|
||
skippedRecords++; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.