Skip to content

Commit

Permalink
issue #106: adding indicators to the completeness analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Nov 18, 2021
1 parent 57cda5e commit 3a3e860
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 24 deletions.
67 changes: 51 additions & 16 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public class Completeness implements MarcFileProcessor, Serializable {

private static final Logger logger = Logger.getLogger(Completeness.class.getCanonicalName());
private static final Pattern dataFieldPattern = Pattern.compile("^(\\d\\d\\d)\\$(.*)$");
private static final Pattern numericalPattern = Pattern.compile("^(\\d)$");

private final Options options;
private CompletenessParameters parameters;
Expand Down Expand Up @@ -113,6 +114,27 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
count(library, libraryCounter);
}

processLeader(marcRecord, recordFrequency, recordPackageCounter, documentType);
processSimpleControlfields(marcRecord, recordFrequency, recordPackageCounter, documentType);
processPositionalControlFields(marcRecord, recordFrequency, recordPackageCounter, documentType);
processDataFields(marcRecord, recordFrequency, recordPackageCounter, documentType);

for (String key : recordFrequency.keySet()) {
count(key, elementFrequency.get(documentType));
count(key, elementFrequency.get("all"));

fieldHistogram.computeIfAbsent(key, s -> new TreeMap<>());
count(recordFrequency.get(key), fieldHistogram.get(key));
}

for (String key : recordPackageCounter.keySet()) {
packageCounter.computeIfAbsent(documentType, s -> new TreeMap<>());
count(key, packageCounter.get(documentType));
count(key, packageCounter.get("all"));
}
}

private void processLeader(MarcRecord marcRecord, Map<String, Integer> recordFrequency, Map<String, Integer> recordPackageCounter, String documentType) {
if (marcRecord.getLeader() != null) {
for (ControlValue position : marcRecord.getLeader().getValuesList()) {
String marcPath = position.getDefinition().getId();
Expand All @@ -122,7 +144,9 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
count(TagCategory.tags00x.getPackageName(), recordPackageCounter);
}
}
}

private void processSimpleControlfields(MarcRecord marcRecord, Map<String, Integer> recordFrequency, Map<String, Integer> recordPackageCounter, String documentType) {
for (MarcControlField field : marcRecord.getSimpleControlfields()) {
if (field != null) {
String marcPath = field.getDefinition().getTag();
Expand All @@ -132,7 +156,9 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
count(TagCategory.tags00x.getPackageName(), recordPackageCounter);
}
}
}

private void processPositionalControlFields(MarcRecord marcRecord, Map<String, Integer> recordFrequency, Map<String, Integer> recordPackageCounter, String documentType) {
for (MarcPositionalControlField field : marcRecord.getPositionalControlfields()) {
if (field != null) {
for (ControlValue position : field.getValuesList()) {
Expand All @@ -144,34 +170,42 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
}
}
}
}

private void processDataFields(MarcRecord marcRecord, Map<String, Integer> recordFrequency, Map<String, Integer> recordPackageCounter, String documentType) {
for (DataField field : marcRecord.getDatafields()) {
if (parameters.getIgnorableFields().contains(field.getTag()))
continue;

count(getPackageName(field), recordPackageCounter);

for (MarcSubfield subfield : field.getSubfields()) {
String marcPath = String.format("%s$%s", field.getTag(), subfield.getCode());
List<String> marcPaths = getMarcPaths(field);
for (String marcPath : marcPaths) {
count(marcPath, elementCardinality.get(documentType));
count(marcPath, elementCardinality.get("all"));
count(marcPath, recordFrequency);
}
}
}

for (String key : recordFrequency.keySet()) {
count(key, elementFrequency.get(documentType));
count(key, elementFrequency.get("all"));
private List<String> getMarcPaths(DataField field) {
List<String> marcPaths = new ArrayList<>();

fieldHistogram.computeIfAbsent(key, s -> new TreeMap<>());
count(recordFrequency.get(key), fieldHistogram.get(key));
}
if (field.getInd1() != null)
if (field.getDefinition().getInd1().exists() || !field.getInd1().equals(" "))
marcPaths.add(String.format("%s$!ind1", field.getTag()));

for (String key : recordPackageCounter.keySet()) {
packageCounter.computeIfAbsent(documentType, s -> new TreeMap<>());
count(key, packageCounter.get(documentType));
count(key, packageCounter.get("all"));
}
if (field.getInd2() != null)
if (field.getDefinition().getInd2().exists() || !field.getInd2().equals(" "))
marcPaths.add(String.format("%s$!ind2", field.getTag()));

for (MarcSubfield subfield : field.getSubfields())
if (numericalPattern.matcher(subfield.getCode()).matches())
marcPaths.add(String.format("%s$|%s", field.getTag(), subfield.getCode()));
else
marcPaths.add(String.format("%s$%s", field.getTag(), subfield.getCode()));

return marcPaths;
}

private String getPackageName(DataField field) {
Expand Down Expand Up @@ -357,7 +391,8 @@ private String formatCardinality(char separator,
logger.severe("Empty key from " + marcPath);
}

TagHierarchy tagHierarchy = TagHierarchy.createFromPath(marcPath, parameters.getMarcVersion());
String marcPathLabel = marcPath.replace("!ind", "ind").replaceAll("\\|(\\d)$", "$1");
TagHierarchy tagHierarchy = TagHierarchy.createFromPath(marcPathLabel, parameters.getMarcVersion());
int packageId;
String packageLabel = "";
String tagLabel = "";
Expand All @@ -368,7 +403,7 @@ private String formatCardinality(char separator,
tagLabel = tagHierarchy.getTagLabel();
subfieldLabel = tagHierarchy.getSubfieldLabel();
} else {
logger.severe("Key can not be found in the TagHierarchy: " + marcPath);
logger.severe("Key can not be found in the TagHierarchy: " + marcPathLabel);
packageId = TagCategory.other.getId();
packageLabel = TagCategory.other.getLabel();
}
Expand All @@ -382,7 +417,7 @@ private String formatCardinality(char separator,

List<Object> values = quote(
Arrays.asList(
documentType, marcPath, packageId, packageLabel, tagLabel, subfieldLabel,
documentType, marcPathLabel, packageId, packageLabel, tagLabel, subfieldLabel,
frequency, cardinality,
statistics.getMin(), statistics.getMax(),
statistics.getMean(), statistics.getStdDev(),
Expand Down
12 changes: 8 additions & 4 deletions src/main/java/de/gwdg/metadataqa/marc/dao/DataField.java
Original file line number Diff line number Diff line change
Expand Up @@ -328,13 +328,17 @@ public Map<String, List<String>> getKeyValuePairs(SolrFieldType type) {
definition, type, getTag()
);

String value = (definition != null && definition.getInd1().exists()) ? resolveInd1() : (getInd1() != null ? getInd1() : null);
if (value != null && StringUtils.isNotBlank(value))
boolean hasInd1def = (definition != null && definition.getInd1().exists());
if (hasInd1def || !getInd1().equals(" ")) {
String value = hasInd1def ? resolveInd1() : getInd1();
pairs.put(keyGenerator.forInd1(), Arrays.asList(value));
}

value = (definition != null && definition.getInd2().exists()) ? resolveInd2() : (getInd2() != null ? getInd2() : null);
if (value != null && StringUtils.isNotBlank(value))
boolean hasInd2def = (definition != null && definition.getInd2().exists());
if (hasInd2def || !getInd2().equals(" ")) {
String value = hasInd2def ? resolveInd2() : getInd2();
pairs.put(keyGenerator.forInd2(), Arrays.asList(value));
}

for (MarcSubfield subfield : subfields)
pairs.putAll(subfield.getKeyValuePairs(keyGenerator));
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/de/gwdg/metadataqa/marc/dao/MarcRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -342,10 +342,9 @@ public Map<String, List<String>> getKeyValuePairs(SolrFieldType type,
mainKeyValuePairs.put("type", Arrays.asList(getType().getValue()));
mainKeyValuePairs.putAll(leader.getKeyValuePairs(type));

for (MarcControlField controlField : getControlfields()) {
for (MarcControlField controlField : getControlfields())
if (controlField != null)
mainKeyValuePairs.putAll(controlField.getKeyValuePairs(type));
}

for (DataField field : datafields) {
Map<String, List<String>> keyValuePairs = field.getKeyValuePairs(type);
Expand Down
14 changes: 12 additions & 2 deletions src/main/java/de/gwdg/metadataqa/marc/utils/TagHierarchy.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import de.gwdg.metadataqa.marc.definition.structure.ControlfieldPositionDefinition;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.definition.structure.Indicator;
import de.gwdg.metadataqa.marc.definition.structure.SubfieldDefinition;
import de.gwdg.metadataqa.marc.definition.TagDefinitionLoader;
import de.gwdg.metadataqa.marc.definition.tags.TagCategory;
Expand Down Expand Up @@ -115,8 +116,17 @@ public static TagHierarchy createFromPath(String path, MarcVersion version) {
if (definition != null) {
String tagLabel = definition.getLabel();

SubfieldDefinition subfield = definition.getSubfield(subfieldCode);
String subfieldLabel = subfield != null ? subfield.getLabel() : "";
String subfieldLabel = "";
if (subfieldCode.equals("ind1")) {
Indicator indicator = definition.getInd1();
subfieldLabel = indicator.exists() ? indicator.getLabel() : "";
} else if (subfieldCode.equals("ind2")) {
Indicator indicator = definition.getInd2();
subfieldLabel = indicator.exists() ? indicator.getLabel() : "";
} else {
SubfieldDefinition subfield = definition.getSubfield(subfieldCode);
subfieldLabel = subfield != null ? subfield.getLabel() : "";
}

String packageName = Utils.extractPackageName(definition);
TagCategory category = TagCategory.getPackage(packageName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,13 @@ public void testSimple() {

namesMap.forEach((key, value) -> System.out.println(key + " " + value));
}

/**
 * Pins the two path normalizations applied in Completeness.formatCardinality():
 * the "!ind" indicator marker becomes "ind", and the "|" marker in front of a
 * trailing single digit subfield code is dropped.
 */
@Test
public void testRegex() {
  // same marker and replacement as used by the production code ("!ind", not "_ind")
  String indicatorPath = "041$!ind1";
  assertEquals("041$ind1", indicatorPath.replace("!ind", "ind"));

  String numericSubfieldPath = "041$|0";
  assertEquals("041$0", numericSubfieldPath.replaceAll("\\|(\\d)$", "$1"));
}
}

0 comments on commit 3a3e860

Please sign in to comment.