Skip to content

Commit

Permalink
issue #100: Introducing --fixAlma parameter.
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Oct 5, 2021
1 parent 5522b71 commit ed59b26
Show file tree
Hide file tree
Showing 11 changed files with 74 additions and 20 deletions.
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,13 @@ options:
* input formats:
* `-x`, `--marcxml` the input files are not binary MARC, but MARCXML files
* `-p`, `--alephseq` the input files are in Alephseq format
* `-q`, `--fixAlephseq` sometime ALEPH export contains '^' characters
instead spaces in control fields (006, 007, 008). This flag replace
them to spaces before the validation. It might occur in any input
* `-q`, `--fixAlephseq` sometimes ALEPH exports contain '^' characters
instead of spaces in control fields (006, 007, 008). This flag replaces
them with spaces before the validation. The problem might occur in any
input format.
* `-X`, `--fixAlma` sometimes Alma exports contain '#' characters
instead of spaces in control fields (006, 007, 008). This flag replaces
them with spaces before the validation. The problem might occur in any
input format.
* `-y`, `--lineSeparated` the input files are in line separated format
i.e. it is a text file, where each line is a distinct field, the
Expand Down
2 changes: 1 addition & 1 deletion scripts/onb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
. ./setdir.sh
NAME=onb
MARC_DIR=${BASE_INPUT_DIR}/onb
TYPE_PARAMS=" --emptyLargeCollectors"
TYPE_PARAMS=" --emptyLargeCollectors --fixAlma"
MASK=onb*.mrc

. ./common-script
Expand Down
16 changes: 8 additions & 8 deletions src/main/java/de/gwdg/metadataqa/marc/MarcFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -124,21 +124,21 @@ public static MarcRecord createFromMarc4j(Record marc4jRecord,
public static MarcRecord createFromMarc4j(Record marc4jRecord,
Leader.Type defaultType,
MarcVersion marcVersion) {
return createFromMarc4j(marc4jRecord, defaultType, marcVersion, false);
return createFromMarc4j(marc4jRecord, defaultType, marcVersion, null);
}

/**
* Create a MarcRecord object from Marc4j object
* @param marc4jRecord The Marc4j record
* @param defaultType The default document type
* @param marcVersion The MARC version
* @param fixAlephseq Replace ^ character to space in control fields
* @param replecementInControlFields A ^ or # character which should be replaced with a space in control fields (null means no replacement)
* @return The MarcRecord created from the Marc4j record
*/
public static MarcRecord createFromMarc4j(Record marc4jRecord,
Leader.Type defaultType,
MarcVersion marcVersion,
boolean fixAlephseq) {
String replecementInControlFields) {
var marcRecord = new MarcRecord();

if (marc4jRecord.getLeader() != null) {
Expand All @@ -154,7 +154,7 @@ public static MarcRecord createFromMarc4j(Record marc4jRecord,
}
}

importMarc4jControlFields(marc4jRecord, marcRecord, fixAlephseq);
importMarc4jControlFields(marc4jRecord, marcRecord, replecementInControlFields);

importMarc4jDataFields(marc4jRecord, marcRecord, marcVersion);

Expand All @@ -165,7 +165,7 @@ public static MarcRecord createPicaFromMarc4j(Record marc4jRecord, Map<String, P
var marcRecord = new MarcRecord();
marcRecord.setSchemaType(SchemaType.PICA);

importMarc4jControlFields(marc4jRecord, marcRecord, false);
importMarc4jControlFields(marc4jRecord, marcRecord, null);

importMarc4jDataFields(marc4jRecord, marcRecord, schemaDirectory);

Expand All @@ -174,11 +174,11 @@ public static MarcRecord createPicaFromMarc4j(Record marc4jRecord, Map<String, P

private static void importMarc4jControlFields(Record marc4jRecord,
MarcRecord marcRecord,
boolean fixAlephseq) {
String replecementInControlFields) {
for (ControlField controlField : marc4jRecord.getControlFields()) {
String data = controlField.getData();
if (fixAlephseq && isFixable(controlField.getTag()))
data = data.replace("^", " ");
if (replecementInControlFields != null && isFixable(controlField.getTag()))
data = data.replace(replecementInControlFields, " ");
switch (controlField.getTag()) {
case "001":
marcRecord.setControl001(new Control001(data)); break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public class CommonParameters implements Serializable {
private String id = null;
protected Leader.Type defaultRecordType = null;
protected boolean fixAlephseq = false;
protected boolean fixAlma = false;
protected boolean alephseq = false;
protected boolean marcxml = false;
protected boolean lineSeparated = false;
Expand All @@ -51,6 +52,7 @@ protected void setOptions() {
options.addOption("i", "id", true, "the MARC identifier (content of 001)");
options.addOption("d", "defaultRecordType", true, "the default record type if the record's type is undetectable");
options.addOption("q", "fixAlephseq", false, "fix the known issues of Alephseq format");
options.addOption("X", "fixAlma", false, "fix the known issues of Alma format");
options.addOption("p", "alephseq", false, "the source is in Alephseq format");
options.addOption("x", "marcxml", false, "the source is in MARCXML format");
options.addOption("y", "lineSeparated", false, "the source is in line separated MARC format");
Expand Down Expand Up @@ -102,6 +104,8 @@ public CommonParameters(String[] arguments) throws ParseException {

fixAlephseq = cmd.hasOption("fixAlephseq");

fixAlma = cmd.hasOption("fixAlma");

setMarcxml(cmd.hasOption("marcxml"));

lineSeparated = cmd.hasOption("lineSeparated");
Expand Down Expand Up @@ -244,6 +248,23 @@ public void setFixAlephseq(boolean fixAlephseq) {
this.fixAlephseq = fixAlephseq;
}

/**
 * Tells whether the Alma-specific fix is enabled.
 *
 * @return the current value of the {@code fixAlma} flag
 */
public boolean fixAlma() {
  return this.fixAlma;
}

/**
 * Enables or disables the Alma-specific fix.
 *
 * @param fixAlma the new value of the {@code fixAlma} flag
 */
public void setFixAlma(final boolean fixAlma) {
  this.fixAlma = fixAlma;
}

/**
 * Selects the character that should be replaced with a space in control fields,
 * based on the fix flags.
 *
 * <p>The Alephseq fix takes precedence: when both {@code fixAlephseq} and
 * {@code fixAlma} are enabled, only {@code "^"} is returned.
 *
 * @return {@code "^"} if the Alephseq fix is enabled, otherwise {@code "#"} if the
 *         Alma fix is enabled, otherwise {@code null}
 */
public String getReplecementInControlFields() {
  if (fixAlephseq()) {
    return "^";
  }
  return fixAlma() ? "#" : null;
}

public boolean isAlephseq() {
return alephseq;
}
Expand Down Expand Up @@ -323,6 +344,7 @@ public String formatParameters() {
text += String.format("id: %s%n", id);
text += String.format("defaultRecordType: %s%n", defaultRecordType);
text += String.format("fixAlephseq: %s%n", fixAlephseq);
text += String.format("fixAlma: %s%n", fixAlma);
text += String.format("alephseq: %s%n", alephseq);
text += String.format("marcxml: %s%n", marcxml);
text += String.format("lineSeparated: %s%n", lineSeparated);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public static void main(String[] args) throws ParseException {
MarcReader reader = ReadMarc.getMarcStringReader(content);
Record marc4jRecord = reader.next();
MarcRecord marcRecord = MarcFactory.createFromMarc4j(
marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.getReplecementInControlFields());
validator.processRecord(marcRecord, 1);
return ValidationErrorFormatter
.formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public class RecordIterator {
private int i = 0;
private String lastKnownId = "";
private CommonParameters parameters;
private boolean fixAlephseq;
private String replecementInControlFields;
private MarcVersion marcVersion;
private Leader.Type defaultRecordType;
private DecimalFormat decimalFormat;
Expand All @@ -51,7 +51,7 @@ public void start() {

marcVersion = parameters.getMarcVersion();
defaultRecordType = parameters.getDefaultRecordType();
fixAlephseq = parameters.fixAlephseq();
replecementInControlFields = parameters.getReplecementInControlFields();
decimalFormat = new DecimalFormat();

if (processor.getParameters().doLog())
Expand Down Expand Up @@ -151,7 +151,7 @@ private void processContent(MarcReader reader, String fileName) {

try {
processor.processRecord(marc4jRecord, i);
MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, defaultRecordType, marcVersion, fixAlephseq);
MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, defaultRecordType, marcVersion, replecementInControlFields);
try {
processor.processRecord(marcRecord, i);
} catch(Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ public void formatParameters() {
"id: null\n" +
"defaultRecordType: null\n" +
"fixAlephseq: false\n" +
"fixAlma: false\n" +
"alephseq: false\n" +
"marcxml: false\n" +
"lineSeparated: false\n" +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public void testFieldValidation() {
marc4jRecord = reader.next();
assertNotNull(marc4jRecord);
MarcRecord marcRecord = MarcFactory.createFromMarc4j(
marc4jRecord, Leader.Type.BOOKS, MarcVersion.NKCR, true
marc4jRecord, Leader.Type.BOOKS, MarcVersion.NKCR, "^"
);
assertNotNull(marcRecord);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void testMarcRecordFunctions() {
if (reader.hasNext())
marc4jRecord = reader.next();
assertNotNull(marc4jRecord);
MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, Leader.Type.BOOKS, MarcVersion.GENT, true);
MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, Leader.Type.BOOKS, MarcVersion.GENT, "^");
assertNotNull(marcRecord);

assertEquals("000000002", marcRecord.getId());
Expand Down Expand Up @@ -338,7 +338,7 @@ public void testUtf8() {
while (reader.hasNext()) {
marc4jRecord = reader.next();
if (marc4jRecord.getControlNumber().equals("000000008")) {
MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, Leader.Type.BOOKS, MarcVersion.GENT, true);
MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, Leader.Type.BOOKS, MarcVersion.GENT, "^");
assertEquals("München :", marcRecord.getDatafield("260").get(0).getSubfield("a").get(0).getValue());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public void testMarcRecordFunctions() {
if (reader.hasNext())
marc4jRecord = reader.next();
assertNotNull(marc4jRecord);
MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, Leader.Type.BOOKS, MarcVersion.GENT, true);
MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, Leader.Type.BOOKS, MarcVersion.GENT, "^");
assertNotNull(marcRecord);

assertEquals("010000011", marcRecord.getId());
Expand Down Expand Up @@ -414,4 +414,24 @@ public void testMarcRecordFunctions() {
assertEquals(1, hits.size());
assertEquals("GBV_ILN_20", hits.get(0));
}

/**
 * Checks that '#' characters in a fixable control field are replaced with spaces
 * when "#" is passed to {@code MarcFactory.createFromMarc4j} as the character to replace.
 */
@Test
public void testAlmaReplacement() {
  Path path;
  try {
    path = FileUtils.getPath("marctxt/with-alma-character.mrctxt");
  } catch (IOException | URISyntaxException e) {
    // Fail with the real cause; printing the stack trace and continuing would
    // only surface later as a NullPointerException on path.toString().
    throw new AssertionError("Could not locate test resource marctxt/with-alma-character.mrctxt", e);
  }
  MarcReader reader = new MarclineReader(path.toString());
  Record marc4jRecord = null;
  if (reader.hasNext()) {
    marc4jRecord = reader.next();
  }
  assertNotNull(marc4jRecord);
  MarcRecord marcRecord = MarcFactory.createFromMarc4j(marc4jRecord, Leader.Type.BOOKS, MarcVersion.GENT, "#");
  assertNotNull(marcRecord);
  // The fixture's 007 field is "tu###"; each '#' is replaced with exactly one space.
  assertEquals("tu   ", marcRecord.getControl007().get(0).getContent());
}
}
7 changes: 7 additions & 0 deletions src/test/resources/marctxt/with-alma-character.mrctxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
LEADER 02191cam a2200541 4500
001 010000011
003 DE-627
005 20180502143346.0
007 tu###
008 861106s1985 xx ||||| 10| ||ger c
035 $a(DE-627)010000011

0 comments on commit ed59b26

Please sign in to comment.