Skip to content

Commit

Permalink
issue #105: Add defaultEncoding parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Nov 16, 2021
1 parent df6863e commit 57cda5e
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 12 deletions.
11 changes: 8 additions & 3 deletions index
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ usage:
-v, --marcVersion <version> MARC version.
Possible values: MARC21 (default), OCLC, DNB, GENT, SZTE, FENNICA, UNIMARC
-i, --ignorableRecords <condition> ignore records from the analysis
-g, --defaultEncoding Default character encoding
-s, --status status information
-p, --purge delete all records from a core
-h, --help this help
Expand All @@ -46,8 +47,8 @@ if [ $# -eq 0 ]; then
show_usage
fi

# Parse the command line with getopt(1). The short and long option lists must
# stay in sync with the `case` dispatch that follows; `g:` / `defaultEncoding:`
# declare an option that takes a required argument.
# NOTE(review): the short-option string contains both `p:` and `p`, so `-p` is
# declared twice (the dispatch below maps `-p` to --purge) -- confirm which
# meaning is intended.
GETOPT=$(getopt -o d:p:m:ws::xatr:hSpv:l:i:g: \
  --long db:,file-path:,file-mask:,no-delete,solrFieldType:,marcxml,alephseq,trimId,defaultRecordType,help,status,purge,marcVersion:,limit:,ignorableRecords:,defaultEncoding: \
  -n "${ME}" -- "$@")
eval set -- "$GETOPT"

Expand All @@ -57,6 +58,7 @@ defaultRecordType=BOOKS
marcVersion=MARC21
limit=""
DELETE=1
defaultEncoding=""
while true ; do
case "$1" in
-d|--db) DB=$2 ; shift 2;;
Expand All @@ -71,6 +73,7 @@ while true ; do
-x|--marcxml) marcxml="--marcxml" ; shift;;
-a|--alephseq) alephseq="--alephseq" ; shift;;
-t|--trimId) trimId="--trimId" ; shift;;
-g|--defaultEncoding) defaultEncoding="--defaultEncoding $2" ; shift 2;;
-S|--status) status ; shift ;;
-p|--purge) purge_and_exit $DB ; shift ;;
-h|--help) show_usage ; shift ;;
Expand All @@ -80,6 +83,7 @@ while true ; do
done

echo "limit: $limit"
echo "defaultEncoding: $defaultEncoding"

CORE=${DB}_dev

Expand Down Expand Up @@ -112,14 +116,15 @@ running the command
$marcxml \
$alephseq \
$ignorableRecords \
$defaultEncoding \
${FILE_PATH}/${FILE_MASK}
---END
EOT

# Run the Solr indexer. The optional-flag variables ($limit, $trimId, $marcxml,
# $alephseq, $ignorableRecords, $defaultEncoding) must stay unquoted: each one
# is either empty or holds a whole "--flag [value]" string that has to be
# word-split into separate arguments (e.g. "--defaultEncoding MARC8").
/usr/bin/java -cp $JAR de.gwdg.metadataqa.marc.cli.MarcToSolr \
  --solrUrl ${SOLR_DB_URL} --solrFieldType $solrFieldType \
  --defaultRecordType $defaultRecordType \
  --marcVersion $marcVersion $limit $trimId $marcxml $alephseq $ignorableRecords $defaultEncoding \
  ${FILE_PATH}/${FILE_MASK}

# echo "Start optimizing"
Expand Down
2 changes: 1 addition & 1 deletion scripts/mek.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
. ./setdir.sh
NAME=mek
MARC_DIR=${BASE_INPUT_DIR}/mek
TYPE_PARAMS="--emptyLargeCollectors"
TYPE_PARAMS="--emptyLargeCollectors --defaultEncoding MARC8"
MASK=MEKmind.mrc

. ./common-script
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public class CommonParameters implements Serializable {
protected IgnorableRecords ignorableRecords = new IgnorableRecords();
protected IgnorableFields ignorableFields = new IgnorableFields();
protected InputStream stream = null;
protected String defaultEncoding = null;

protected Options options = new Options();
protected static final CommandLineParser parser = new DefaultParser();
Expand All @@ -62,6 +63,7 @@ protected void setOptions() {
options.addOption("v", "ignorableRecords", true, "ignore records from the analysis");
options.addOption("m", "marcFormat", true, "MARC format (like 'ISO' or 'MARCXML')");
options.addOption("m", "dataSource", true, "data source (file of stream)");
options.addOption("g", "defaultEncoding", true, "default character encoding");
isOptionSet = true;
}
}
Expand Down Expand Up @@ -121,6 +123,9 @@ public CommonParameters(String[] arguments) throws ParseException {
if (cmd.hasOption("ignorableRecords"))
setIgnorableRecords(cmd.getOptionValue("ignorableRecords"));

if (cmd.hasOption("defaultEncoding"))
setDefaultEncoding(cmd.getOptionValue("defaultEncoding"));

args = cmd.getArgs();
}

Expand Down Expand Up @@ -333,6 +338,14 @@ public void setStream(InputStream stream) {
this.stream = stream;
}

/**
 * Returns the default character encoding requested on the command line.
 *
 * @return the value given to {@code --defaultEncoding}, or {@code null} when the option
 *         was not supplied
 */
public String getDefaultEncoding() {
  return this.defaultEncoding;
}

/**
 * Stores the default character encoding.
 *
 * @param defaultEncoding the encoding name as given on the command line (e.g. "MARC8");
 *        it is stored as-is, without validation
 */
private void setDefaultEncoding(String defaultEncoding) {
this.defaultEncoding = defaultEncoding;
}

public String formatParameters() {
String text = "";
text += String.format("marcVersion: %s, %s%n", marcVersion.getCode(), marcVersion.getLabel());
Expand All @@ -352,6 +365,7 @@ public String formatParameters() {
text += String.format("trimId: %s%n", trimId);
text += String.format("ignorableFields: %s%n", ignorableFields);
text += String.format("ignorableRecords: %s%n", ignorableRecords);
text += String.format("defaultEncoding: %s%n", defaultEncoding);

return text;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,17 @@ private void processContent(MarcReader reader, String fileName) {

/**
 * Opens a MARC reader for a single input file.
 *
 * <p>Files whose name ends with ".gz" are decompressed on the fly and read as a stream; all
 * other files are opened by name. In both cases the MARC format and the default character
 * encoding are taken from {@code parameters}.
 *
 * @param parameters supplies the MARC format and the default encoding
 * @param path the file to read
 * @return a reader over the records of the file
 * @throws Exception if the file cannot be opened or the reader cannot be created
 */
private MarcReader getMarcFileReader(CommonParameters parameters, Path path) throws Exception {
  String encoding = parameters.getDefaultEncoding();
  if (!path.toString().endsWith(".gz")) {
    return ReadMarc.getFileReader(parameters.getMarcFormat(), path.toString(), encoding);
  }

  FileInputStream fileStream = new FileInputStream(path.toFile());
  try {
    return ReadMarc.getStreamReader(
      parameters.getMarcFormat(),
      new GZIPInputStream(fileStream),
      encoding);
  } catch (Exception e) {
    // GZIPInputStream reads the gzip header in its constructor; if that (or the reader
    // creation) fails, nobody else holds the file stream, so release it before rethrowing.
    try {
      fileStream.close();
    } catch (Exception closeFailure) {
      e.addSuppressed(closeFailure);
    }
    throw e;
  }
}

/**
 * Opens a MARC reader over the input stream held by {@code parameters}, using the MARC
 * format and the default character encoding configured there.
 *
 * @param parameters supplies the stream, the MARC format and the default encoding
 * @return a reader over the records of the stream
 * @throws Exception if the reader cannot be created
 */
private MarcReader getMarcStreamReader(CommonParameters parameters) throws Exception {
  return ReadMarc.getStreamReader(
    parameters.getMarcFormat(),
    parameters.getStream(),
    parameters.getDefaultEncoding());
}

private Record getNextMarc4jRecord(int i, String lastKnownId, MarcReader reader) {
Expand Down
28 changes: 24 additions & 4 deletions src/main/java/de/gwdg/metadataqa/marc/utils/ReadMarc.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@
public class ReadMarc {

/**
 * Reads the records of a MARC file without specifying a character encoding.
 *
 * <p>Convenience overload of {@code read(String, String)} that passes {@code null} as the
 * encoding.
 *
 * @param fileName path of the file to read
 * @return the records read from the file
 * @throws Exception if the file cannot be opened or read
 */
public static List<Record> read(String fileName) throws Exception {
  final String noEncoding = null;
  return read(fileName, noEncoding);
}

public static List<Record> read(String fileName, String encoding) throws Exception {
InputStream in = new FileInputStream(fileName);
MarcReader reader = new MarcStreamReader(in);
MarcReader reader = new MarcStreamReader(in, encoding);

List<Record> records = new ArrayList<>();
while (reader.hasNext()) {
Expand All @@ -31,8 +35,16 @@ public static MarcReader getIsoFileReader(String fileName) throws Exception {
return getIsoStreamReader(new FileInputStream(fileName));
}

/**
 * Opens a reader for a MARC file in ISO format, forwarding the given character encoding.
 *
 * @param fileName path of the file to read
 * @param encoding encoding name handed on to the stream reader; may be {@code null}
 * @return a reader over the records of the file
 * @throws Exception if the file cannot be opened
 */
public static MarcReader getIsoFileReader(String fileName, String encoding) throws Exception {
  InputStream fileStream = new FileInputStream(fileName);
  return getIsoStreamReader(fileStream, encoding);
}

/**
 * Opens a reader for an ISO format MARC stream without specifying a character encoding.
 *
 * <p>Convenience overload of {@code getIsoStreamReader(InputStream, String)} that passes
 * {@code null} as the encoding.
 *
 * @param stream the stream to read the records from
 * @return a reader over the records of the stream
 * @throws Exception if the reader cannot be created
 */
public static MarcReader getIsoStreamReader(InputStream stream) throws Exception {
  return getIsoStreamReader(stream, null);
}

/**
 * Opens a reader for an ISO format MARC stream, handing the given character encoding to
 * marc4j's {@code MarcStreamReader}.
 *
 * @param stream the stream to read the records from
 * @param encoding encoding name passed to the reader; may be {@code null}
 *        (NOTE(review): presumably marc4j then falls back to its own default handling --
 *        confirm against the MarcStreamReader documentation)
 * @return a reader over the records of the stream
 * @throws Exception if the reader cannot be created
 */
public static MarcReader getIsoStreamReader(InputStream stream, String encoding) throws Exception {
  MarcReader isoReader = new MarcStreamReader(stream, encoding);
  return isoReader;
}

public static MarcReader getXmlFileReader(String fileName) throws Exception {
Expand Down Expand Up @@ -78,6 +90,10 @@ public static MarcReader getReader(String fileName, boolean isMarcxml) throws Ex
}

/**
 * Opens a reader for a MARC file of the given format without specifying a character
 * encoding.
 *
 * <p>Convenience overload of {@code getFileReader(MarcFormat, String, String)} that passes
 * {@code null} as the encoding.
 *
 * @param marcFormat the serialization format of the file
 * @param fileName path of the file to read
 * @return a reader over the records of the file
 * @throws Exception if the reader cannot be created
 */
public static MarcReader getFileReader(MarcFormat marcFormat, String fileName) throws Exception {
  final String noEncoding = null;
  return getFileReader(marcFormat, fileName, noEncoding);
}

public static MarcReader getFileReader(MarcFormat marcFormat, String fileName, String encoding) throws Exception {
MarcReader reader = null;
switch (marcFormat) {
case ALEPHSEQ:
Expand All @@ -90,12 +106,16 @@ public static MarcReader getFileReader(MarcFormat marcFormat, String fileName) t
reader = ReadMarc.getMarclineFileReader(fileName); break;
case ISO:
default:
reader = ReadMarc.getIsoFileReader(fileName); break;
reader = ReadMarc.getIsoFileReader(fileName, encoding); break;
}
return reader;
}

/**
 * Opens a reader for a MARC stream of the given format without specifying a character
 * encoding.
 *
 * <p>Convenience overload of {@code getStreamReader(MarcFormat, InputStream, String)} that
 * passes {@code null} as the encoding.
 *
 * @param marcFormat the serialization format of the stream
 * @param stream the stream to read the records from
 * @return a reader over the records of the stream
 * @throws Exception if the reader cannot be created
 */
public static MarcReader getStreamReader(MarcFormat marcFormat, InputStream stream) throws Exception {
  final String noEncoding = null;
  return getStreamReader(marcFormat, stream, noEncoding);
}

public static MarcReader getStreamReader(MarcFormat marcFormat, InputStream stream, String encoding) throws Exception {
MarcReader reader = null;
switch (marcFormat) {
case ALEPHSEQ:
Expand All @@ -108,7 +128,7 @@ public static MarcReader getStreamReader(MarcFormat marcFormat, InputStream stre
reader = ReadMarc.getMarclineStreamReader(stream); break;
case ISO:
default:
reader = ReadMarc.getIsoStreamReader(stream); break;
reader = ReadMarc.getIsoStreamReader(stream, encoding); break;
}
return reader;
}
Expand Down
25 changes: 25 additions & 0 deletions src/test/java/de/gwdg/metadataqa/marc/MarcRecordTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
import de.gwdg.metadataqa.marc.dao.Control008;
import de.gwdg.metadataqa.marc.dao.Leader;
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.definition.MarcFormat;
import de.gwdg.metadataqa.marc.definition.controltype.Control007Category;
import de.gwdg.metadataqa.marc.utils.ReadMarc;
import de.gwdg.metadataqa.marc.utils.marcspec.legacy.MarcSpec;
import org.junit.Test;
import org.marc4j.MarcReader;
import org.marc4j.marc.Record;

import java.io.IOException;
Expand Down Expand Up @@ -134,4 +136,27 @@ public void asJson() throws IOException, URISyntaxException {
assertTrue(marcRecord.asJson().contains("\"245\":[{\"ind1\":\"1\",\"ind2\":\"0\",\"subfields\":{\"a\":\"Botanical materia medica and pharmacology;\""));
}

/**
 * Reads the MEK sample record with "MARC8" as the default encoding and checks that, although
 * the leader reports a blank character coding scheme, the accented title is decoded properly.
 */
@Test
public void testFromMek() throws Exception {
  String mrcFile = FileUtils.getPath("marc/22561.mrc").toString();
  Record firstRecord = ReadMarc.read(mrcFile, "MARC8").get(0);
  MarcRecord marcRecord = MarcFactory.createFromMarc4j(firstRecord);

  // the record itself does not declare an encoding ...
  assertEquals(' ', firstRecord.getLeader().getCharCodingScheme());
  assertEquals(" ", marcRecord.getLeader().getCharacterCodingScheme().getValue());

  // ... so the readable title depends on the encoding passed to the reader
  assertEquals(
    "Az ítélet :",
    marcRecord.getDatafield("245").get(0).getSubfield("a").get(0).getValue());
}

/**
 * Same scenario as the list-based MEK test, but going through
 * {@code ReadMarc.getFileReader} with the ISO format and "MARC8" as the default encoding;
 * additionally checks the decoded remainder of the title (245$b).
 */
@Test
public void testFileReaderFromMek() throws Exception {
  String mrcFile = FileUtils.getPath("marc/22561.mrc").toString();
  MarcReader reader = ReadMarc.getFileReader(MarcFormat.ISO, mrcFile, "MARC8");
  Record firstRecord = reader.next();

  // the record itself does not declare an encoding
  assertEquals(' ', firstRecord.getLeader().getCharCodingScheme());

  MarcRecord marcRecord = MarcFactory.createFromMarc4j(firstRecord);
  assertEquals(" ", marcRecord.getLeader().getCharacterCodingScheme().getValue());
  assertEquals(
    "Az ítélet :",
    marcRecord.getDatafield("245").get(0).getSubfield("a").get(0).getValue());
  assertEquals(
    "[Följegyzések és dokumentumok néhány magyarországi református egyházi döntésről 1948 és 1998 között] :",
    marcRecord.getDatafield("245").get(0).getSubfield("b").get(0).getValue());
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,17 @@ public void testTrimId() {
}
}

/**
 * Checks that the value given to {@code --defaultEncoding} is exposed by
 * {@code getDefaultEncoding()}.
 */
@Test
public void testDefaultEncoding() {
  String[] arguments = new String[]{"--defaultEncoding", "MARC8"};
  try {
    CommonParameters parameters = new CommonParameters(arguments);
    assertEquals("MARC8", parameters.getDefaultEncoding());
  } catch (ParseException e) {
    // NOTE(review): a parse failure is only logged, so the test still passes in that case;
    // this mirrors the neighbouring tests -- consider failing the test instead.
    logger.log(Level.WARNING, "error in testDefaultEncoding()", e);
  }
}

@Test
public void formatParameters() {
String[] arguments = new String[]{"--trimId"};
Expand All @@ -125,7 +136,8 @@ public void formatParameters() {
"outputDir: .\n" +
"trimId: true\n" +
"ignorableFields: \n" +
"ignorableRecords: \n";
"ignorableRecords: \n" +
"defaultEncoding: null\n";
assertEquals(expected, parameters.formatParameters());
} catch (ParseException e) {
logger.log(Level.WARNING, "error in formatParameters()", e);
Expand Down
Binary file added src/test/resources/gzip/test.xml.gz
Binary file not shown.
Binary file added src/test/resources/gzip/test2.xml.gz
Binary file not shown.
1 change: 1 addition & 0 deletions src/test/resources/marc/22561.mrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
01133nmm 2200325 r 4500001001000000005001700010008004100027040001800068100004200086245017000128260000900298500001700307500001700324500001600341534006300357650003100420650002200451650002500473650003500498650003400533650005600567650001900623650002500642650001600667650002100683650002200704650002400726856003500750856002200785MEK-2256120211111233606.0211111c2021 hu j hun 1 aMEKbhuncMEK1 aBojtor Istvâand1928-0(viaf)7336547813aAz âitâelet :b[Fčoljegyzâesek âes dokumentumok nâehâany magyarorszâagi reformâatus egyhâazi dčontâesrîol 1948 âes 1998 kčozčott] :b[The Judgment] /cBojtor Istvâan0 c2021 aIllusztrâalt aemlâekiratok aOCR-es PDF. pEredeti kiad. :cSâatoraljaâujhely : Kazinczy Tâars., 2003 aEgyhâazak, egyhâazpolitika aEgyhâaztčortâenet aFelsîofokâu oktatâas aNevelâes- âes iskolatčortâenet aMagyar tčortâenelem 1946-1989 4aâallam âes egyhâaz viszonyacMagyarorszâagy20. sz. 4aâallami terror 4aszocialista rendszer 4aegyhâazčugy 4alelkâeszkâepzâes 4aegyhâazi hivatâas 4areformâatus egyhâaz40uhttp://mek.oszk.hu/22500/2256140uurn:nbn:hu-167565

0 comments on commit 57cda5e

Please sign in to comment.