Skip to content

Commit

Permalink
issue #83: Include control fields in completeness calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Jan 26, 2021
1 parent 7b327ce commit 7efb454
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 13 deletions.
34 changes: 29 additions & 5 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import de.gwdg.metadataqa.marc.cli.parameters.CompletenessParameters;
import de.gwdg.metadataqa.marc.cli.processor.MarcFileProcessor;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.definition.ControlValue;
import de.gwdg.metadataqa.marc.definition.tags.TagCategory;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormat;
import de.gwdg.metadataqa.marc.utils.BasicStatistics;
Expand Down Expand Up @@ -100,13 +101,39 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
// private Map<String, Map<String, Integer>>

String documentType = marcRecord.getType().getValue();
if (!elementCardinality.containsKey(documentType))
elementCardinality.put(documentType, new TreeMap<>());

if (marcRecord.getControl003() != null)
count(marcRecord.getControl003().getContent(), library003Counter);

for (String library : extract(marcRecord, "852", "a")) {
count(library, libraryCounter);
}

for (MarcControlField field : marcRecord.getSimpleControlfields()) {
if (field != null) {
String marcPath = field.getDefinition().getTag();
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(TagCategory.tags00x.getPackageName(), recordPackageCounter);
}
}

for (MarcPositionalControlField field : marcRecord.getPositionalControlfields()) {
// String marcPath = field.getDefinition().getTag();
// count(marcPath, elementCardinality.get(documentType));
// count(marcPath, elementCardinality.get("all"));
if (field != null) {
for (ControlValue position : field.getValuesList()) {
String marcPath = position.getDefinition().getPath(false);
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(TagCategory.tags00x.getPackageName(), recordPackageCounter);
}
}
}

for (DataField field : marcRecord.getDatafields()) {
if (parameters.getIgnorableFields().contains(field.getTag()))
continue;
Expand All @@ -125,8 +152,6 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce

for (MarcSubfield subfield : field.getSubfields()) {
String marcPath = String.format("%s$%s", field.getTag(), subfield.getCode());
if (!elementCardinality.containsKey(documentType))
elementCardinality.put(documentType, new TreeMap<>());
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(marcPath, recordFrequency);
Expand Down Expand Up @@ -312,9 +337,8 @@ separator, quote(documentType), id, quote(range), quote(label), isPartOfMarcScor
}

private void saveLibraries(String fileExtension, char separator) {
Path path;
System.err.println("Libraries");
path = Paths.get(parameters.getOutputDir(), "libraries" + fileExtension);
logger.info("Saving Libraries");
Path path = Paths.get(parameters.getOutputDir(), "libraries" + fileExtension);
try (BufferedWriter writer = Files.newBufferedWriter(path)) {
writer.write("library" + separator + "count\n");
libraryCounter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

public enum TagCategory {

tags00x(0, "tags00x", "00X", "Control Fields", true),
tags01x(1, "tags01x", "01X-09X", "Numbers and Code", true),
tags1xx(2, "tags1xx", "1XX", "Main Entry", true),
tags20x(3, "tags20x", "20X-24X", "Title", true),
Expand Down
55 changes: 47 additions & 8 deletions src/main/java/de/gwdg/metadataqa/marc/utils/TagHierarchy.java
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
package de.gwdg.metadataqa.marc.utils;

import de.gwdg.metadataqa.marc.SimpleControlField;
import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.definition.structure.SubfieldDefinition;
import de.gwdg.metadataqa.marc.definition.TagDefinitionLoader;
import de.gwdg.metadataqa.marc.definition.tags.TagCategory;
import de.gwdg.metadataqa.marc.definition.tags.control.Control001Definition;
import de.gwdg.metadataqa.marc.definition.tags.control.Control003Definition;
import de.gwdg.metadataqa.marc.definition.tags.control.Control005Definition;
import de.gwdg.metadataqa.marc.definition.tags.control.Control006Definition;
import de.gwdg.metadataqa.marc.definition.tags.control.Control007Definition;
import de.gwdg.metadataqa.marc.definition.tags.control.Control008Definition;
import org.apache.commons.lang3.StringUtils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TagHierarchy {

/**
 * Matches a data field path of the form {@code <3 digits>$<rest>}.
 * Group 1 is the three-digit tag, group 2 is everything after the dollar sign.
 */
private static final Pattern dataFieldPattern = Pattern.compile("^(\\d\\d\\d)\\$(.*)$");
/**
 * Matches a control field path: a tag of the form {@code 00<digit>}, optionally
 * followed by a slash and either a single position or a position range.
 * Group 1 is the tag, group 2 is the optional suffix including the slash,
 * group 3 is the position or range without the slash.
 * Declared final, like dataFieldPattern, so the compiled pattern cannot be reassigned.
 */
private static final Pattern controlFieldPattern = Pattern.compile("^(00\\d)(/(\\d+|\\d+-\\d+))?$");

private TagCategory category;
private String tagLabel;
Expand Down Expand Up @@ -48,22 +57,52 @@ public static TagHierarchy createFromPath(String path) {
}

public static TagHierarchy createFromPath(String path, MarcVersion version) {
Matcher matcher = dataFieldPattern.matcher(path);
Matcher matcher;
matcher = controlFieldPattern.matcher(path);
if (matcher.matches()) {
String tag = matcher.group(1);
String subfieldCode = matcher.group(2);
String position = matcher.group(3);
DataFieldDefinition definition = null;
if (tag.equals("001"))
definition = Control001Definition.getInstance();
else if (tag.equals("003"))
definition = Control003Definition.getInstance();
else if (tag.equals("005"))
definition = Control005Definition.getInstance();
else if (tag.equals("006"))
definition = Control006Definition.getInstance();
else if (tag.equals("007"))
definition = Control007Definition.getInstance();
else if (tag.equals("008"))
definition = Control008Definition.getInstance();

DataFieldDefinition definition = TagDefinitionLoader.load(tag, version);
if (definition != null) {
String tagLabel = definition.getLabel();

SubfieldDefinition subfield = definition.getSubfield(subfieldCode);
String subfieldLabel = subfield != null ? subfield.getLabel() : "";
String subfieldLabel = "";
if (StringUtils.isNotBlank(position)) {
subfieldLabel = position;
}
return new TagHierarchy(TagCategory.tags00x, tagLabel, subfieldLabel);
}
} else {
matcher = dataFieldPattern.matcher(path);
if (matcher.matches()) {
String tag = matcher.group(1);
String subfieldCode = matcher.group(2);

DataFieldDefinition definition = TagDefinitionLoader.load(tag, version);
if (definition != null) {
String tagLabel = definition.getLabel();

SubfieldDefinition subfield = definition.getSubfield(subfieldCode);
String subfieldLabel = subfield != null ? subfield.getLabel() : "";

String packageName = Utils.extractPackageName(definition);
TagCategory category = TagCategory.getPackage(packageName);
String packageName = Utils.extractPackageName(definition);
TagCategory category = TagCategory.getPackage(packageName);

return new TagHierarchy(category, tagLabel, subfieldLabel);
return new TagHierarchy(category, tagLabel, subfieldLabel);
}
}
}
return null;
Expand Down
46 changes: 46 additions & 0 deletions src/test/java/de/gwdg/metadataqa/marc/utils/TagHierarchyTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,50 @@ public void createFromPath_withGent() {
assertEquals("Locally defined field in Gent", tagHierarchy.getTagLabel());
assertEquals("Value", tagHierarchy.getSubfieldLabel());
}

/**
 * The bare path "001" must resolve to a hierarchy in the "Control Fields"
 * package, labelled "Control Number", with an empty subfield label.
 */
@Test
public void testTagHierarchySimpleControlField_001() {
  final String path = "001";
  final TagHierarchy hierarchy = TagHierarchy.createFromPath(path);
  assertNotNull(hierarchy);
  assertEquals("Control Fields", hierarchy.getPackageLabel());
  assertEquals("Control Number", hierarchy.getTagLabel());
  assertEquals("", hierarchy.getSubfieldLabel());
}

/**
 * The bare path "003" must resolve to a hierarchy in the "Control Fields"
 * package, labelled "Control Number Identifier", with an empty subfield label.
 */
@Test
public void testTagHierarchySimpleControlField_003() {
  final String path = "003";
  final TagHierarchy hierarchy = TagHierarchy.createFromPath(path);
  assertNotNull(hierarchy);
  assertEquals("Control Fields", hierarchy.getPackageLabel());
  assertEquals("Control Number Identifier", hierarchy.getTagLabel());
  assertEquals("", hierarchy.getSubfieldLabel());
}

/**
 * The bare path "005" must resolve to a hierarchy in the "Control Fields"
 * package, labelled "Date and Time of Latest Transaction", with an empty
 * subfield label.
 */
@Test
public void testTagHierarchySimpleControlField_005() {
  final String path = "005";
  final TagHierarchy hierarchy = TagHierarchy.createFromPath(path);
  assertNotNull(hierarchy);
  assertEquals("Control Fields", hierarchy.getPackageLabel());
  assertEquals("Date and Time of Latest Transaction", hierarchy.getTagLabel());
  assertEquals("", hierarchy.getSubfieldLabel());
}

/**
 * The path "006/16-17" (tag plus position range) must resolve to a hierarchy
 * in the "Control Fields" package, labelled "Additional Material
 * Characteristics", whose subfield label is the position range "16-17".
 *
 * NOTE(review): the method name says "Simple" although the path carries a
 * position range; consider renaming if the distinction matters.
 */
@Test
public void testTagHierarchySimpleControlField_006() {
  final String path = "006/16-17";
  final TagHierarchy hierarchy = TagHierarchy.createFromPath(path);
  assertNotNull(hierarchy);
  assertEquals("Control Fields", hierarchy.getPackageLabel());
  assertEquals("Additional Material Characteristics", hierarchy.getTagLabel());
  assertEquals("16-17", hierarchy.getSubfieldLabel());
}

/**
 * The path "008/00-05" (tag plus position range) must resolve to a hierarchy
 * in the "Control Fields" package, labelled "General Information", whose
 * subfield label is the position range "00-05".
 *
 * NOTE(review): the method name says "Simple" although the path carries a
 * position range; consider renaming if the distinction matters.
 */
@Test
public void testTagHierarchySimpleControlField_008() {
  final String path = "008/00-05";
  final TagHierarchy hierarchy = TagHierarchy.createFromPath(path);
  assertNotNull(hierarchy);
  assertEquals("Control Fields", hierarchy.getPackageLabel());
  assertEquals("General Information", hierarchy.getTagLabel());
  assertEquals("00-05", hierarchy.getSubfieldLabel());
}

}

0 comments on commit 7efb454

Please sign in to comment.