Skip to content

Commit

Permalink
issue #83: Include control fields in completeness calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Jan 28, 2021
1 parent ffc712e commit ede5900
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 30 deletions.
55 changes: 26 additions & 29 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
if (!elementCardinality.containsKey(documentType))
elementCardinality.put(documentType, new TreeMap<>());

if (!elementFrequency.containsKey(documentType))
elementFrequency.put(documentType, new TreeMap<>());

if (marcRecord.getControl003() != null)
count(marcRecord.getControl003().getContent(), library003Counter);

Expand All @@ -116,20 +119,18 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
String marcPath = field.getDefinition().getTag();
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(marcPath, recordFrequency);
count(TagCategory.tags00x.getPackageName(), recordPackageCounter);
}
}

for (MarcPositionalControlField field : marcRecord.getPositionalControlfields()) {
// String marcPath = field.getDefinition().getTag();
// count(marcPath, elementCardinality.get(documentType));
// count(marcPath, elementCardinality.get("all"));
if (field != null) {
for (ControlValue position : field.getValuesList()) {
// String marcPath = position.getDefinition().getPath(false);
String marcPath = position.getDefinition().getId();
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(marcPath, recordFrequency);
count(TagCategory.tags00x.getPackageName(), recordPackageCounter);
}
}
Expand All @@ -139,17 +140,7 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
if (parameters.getIgnorableFields().contains(field.getTag()))
continue;

String packageName;
if (field.getDefinition() != null) {
packageName = Utils.extractPackageName(field);
if (StringUtils.isBlank(packageName)) {
logger.warning(String.format("%s has no package. /%s", field, field.getDefinition().getClass()));
packageName = TagCategory.other.getPackageName();
}
} else {
packageName = TagCategory.other.getPackageName();
}
count(packageName, recordPackageCounter);
count(getPackageName(field), recordPackageCounter);

for (MarcSubfield subfield : field.getSubfields()) {
String marcPath = String.format("%s$%s", field.getTag(), subfield.getCode());
Expand All @@ -160,14 +151,11 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
}

for (String key : recordFrequency.keySet()) {
if (!elementFrequency.containsKey(documentType))
elementFrequency.put(documentType, new TreeMap<>());
count(key, elementFrequency.get(documentType));
count(key, elementFrequency.get("all"));

if (!fieldHistogram.containsKey(key)) {
if (!fieldHistogram.containsKey(key))
fieldHistogram.put(key, new TreeMap<>());
}

count(recordFrequency.get(key), fieldHistogram.get(key));
}
Expand All @@ -180,6 +168,20 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
}
}

/**
 * Resolves the package name under which a data field is counted.
 *
 * <p>When the field has no definition, or when the name extracted from it is
 * blank, the package name of {@code TagCategory.other} is returned instead.
 * The blank case is additionally reported through the logger as a warning.
 *
 * @param field the data field whose package name is needed
 * @return the extracted package name, or the "other" category's package name
 */
private String getPackageName(DataField field) {
  // No definition: nothing to extract from, go straight to the fallback.
  if (field.getDefinition() == null) {
    return TagCategory.other.getPackageName();
  }

  String extracted = Utils.extractPackageName(field);
  if (!StringUtils.isBlank(extracted)) {
    return extracted;
  }

  // A definition exists but yields no usable package name: warn, then fall back.
  logger.warning(String.format("%s has no package. /%s", field, field.getDefinition().getClass()));
  return TagCategory.other.getPackageName();
}

private List<String> extract(MarcRecord marcRecord, String tag, String subfield) {
List<String> values = new ArrayList<>();
List<DataField> fields = marcRecord.getDatafield(tag);
Expand Down Expand Up @@ -268,16 +270,11 @@ private void saveMarcElements(String fileExtension, char separator) {
System.err.println("MARC elements");
path = Paths.get(parameters.getOutputDir(), "marc-elements" + fileExtension);
try (BufferedWriter writer = Files.newBufferedWriter(path)) {
writer.write(
StringUtils.join(
Arrays.asList(
"documenttype", "path", "packageid", "package", "tag", "subfield",
"number-of-record", "number-of-instances",
"min", "max", "mean", "stddev", "histogram"
),
separator
) + "\n"
);
writer.write(createRow(
"documenttype", "path", "packageid", "package", "tag", "subfield",
"number-of-record", "number-of-instances",
"min", "max", "mean", "stddev", "histogram"
));
elementCardinality
.keySet()
.stream()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public class TagHierarchy {

private static final Pattern dataFieldPattern = Pattern.compile("^(\\d\\d\\d)\\$(.*)$");
private static Pattern controlFieldPattern = Pattern.compile("^(00\\d)(/(\\d+|\\d+-\\d+))?$");
private static Pattern controlFieldIdPattern = Pattern.compile("^(00[6-8])([a-z]+)(\\d+)$");
private static Pattern controlFieldIdPattern = Pattern.compile("^(00[6-8])([a-z][a-zA-Z]+)(\\d+)$");

private TagCategory category;
private String tagLabel;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,15 @@ public void testTagHierarchySimpleControlField_id_007() {
assertEquals("Specific material designation", tagHierarchy.getSubfieldLabel());
}

/**
 * Checks that a 007 control-field id whose type segment contains an
 * uppercase letter ("motionPicture") is accepted by
 * {@code TagHierarchy.createFromPath} and yields the expected labels.
 */
@Test
public void testTagHierarchySimpleControlField_id_007_withUppercase() {
  final String path = "007motionPicture06";
  TagHierarchy hierarchy = TagHierarchy.createFromPath(path);

  // The path must be recognised at all before the labels can be inspected.
  assertNotNull(hierarchy);

  assertEquals("Control Fields", hierarchy.getPackageLabel());
  assertEquals("Physical Description", hierarchy.getTagLabel());
  assertEquals("Medium for sound", hierarchy.getSubfieldLabel());
}

@Test
public void testTagHierarchySimpleControlField_id_008() {
TagHierarchy tagHierarchy = TagHierarchy.createFromPath("008all07");
Expand Down

0 comments on commit ede5900

Please sign in to comment.