Skip to content

Commit

Permalink
issue #75: add British Library tags
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Dec 3, 2020
1 parent c95515e commit 73b4309
Show file tree
Hide file tree
Showing 17 changed files with 330 additions and 120 deletions.
2 changes: 1 addition & 1 deletion scripts/gent.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ NAME=gent
# MARC_DIR=${BASE_INPUT_DIR}/gent/marc/2019-06-05
# MASK=*.mrc
TYPE_PARAMS="--marcVersion GENT --alephseq"
MARC_DIR=${BASE_INPUT_DIR}/gent/marc/2020-09-19
MARC_DIR=${BASE_INPUT_DIR}/gent/marc/2020-05-27
MASK=*.export

. ./common-script
Expand Down
106 changes: 49 additions & 57 deletions src/main/java/de/gwdg/metadataqa/marc/DataField.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import de.gwdg.metadataqa.marc.definition.general.parser.LinkageParser;
import de.gwdg.metadataqa.marc.definition.general.parser.ParserException;
import de.gwdg.metadataqa.marc.model.SolrFieldType;
import de.gwdg.metadataqa.marc.model.validation.ErrorsCollector;
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorType;
import de.gwdg.metadataqa.marc.utils.keygenerator.DataFieldKeyGenerator;
Expand All @@ -16,6 +17,8 @@
import java.util.*;
import java.util.logging.Logger;

import static de.gwdg.metadataqa.marc.model.validation.ValidationErrorType.*;

public class DataField implements Extractable, Validatable, Serializable {

private static final Logger logger = Logger.getLogger(DataField.class.getCanonicalName());
Expand All @@ -26,7 +29,7 @@ public class DataField implements Extractable, Validatable, Serializable {
private String ind2;
private List<MarcSubfield> subfields;
private Map<String, List<MarcSubfield>> subfieldIndex = new LinkedHashMap<>();
private List<ValidationError> validationErrors = null;
private ErrorsCollector errors = null;
private List<String> unhandledSubfields = null;
private MarcRecord record;

Expand Down Expand Up @@ -384,26 +387,28 @@ public DataFieldDefinition getDefinition() {
@Override
public boolean validate(MarcVersion marcVersion) {
boolean isValid = true;
validationErrors = new ArrayList<>();
errors = new ErrorsCollector();
DataFieldDefinition referencerDefinition = null;
List<MarcSubfield> _subfields = null;
boolean ambiguousLinkage = false;

if (marcVersion == null)
marcVersion = MarcVersion.MARC21;

if (TagDefinitionLoader.load(definition.getTag(), marcVersion) == null) {
addError(FIELD_UNDEFINED, "");
return false;
}

if (getTag().equals("880")) {
List<MarcSubfield> subfield6s = getSubfield("6");
if (subfield6s == null) {
validationErrors.add(new ValidationError(record.getId(), definition.getTag(),
ValidationErrorType.FIELD_MISSING_REFERENCE_SUBFIELD, "$6", definition.getDescriptionUrl()));
addError(FIELD_MISSING_REFERENCE_SUBFIELD, "$6");
isValid = false;
} else {
if (!subfield6s.isEmpty()) {
if (subfield6s.size() != 1) {
validationErrors.add(
new ValidationError(
record.getId(), definition.getTag() + "$6",
ValidationErrorType.RECORD_AMBIGUOUS_LINKAGE, "There are multiple $6",
definition.getDescriptionUrl()
)
);
addError(definition.getTag() + "$6", RECORD_AMBIGUOUS_LINKAGE, "There are multiple $6");
isValid = false;
ambiguousLinkage = true;
} else {
Expand All @@ -412,25 +417,17 @@ public boolean validate(MarcVersion marcVersion) {
try {
linkage = LinkageParser.getInstance().create(subfield6.getValue());
if (linkage == null || linkage.getLinkingTag() == null) {
validationErrors.add(
new ValidationError(
record.getId(), definition.getTag() + "$6",
ValidationErrorType.RECORD_INVALID_LINKAGE,
String.format("Unparseable reference: '%s'", subfield6.getValue()),
definition.getDescriptionUrl()
)
);
String message = String.format("Unparseable reference: '%s'", subfield6.getValue());
addError(RECORD_INVALID_LINKAGE, message);
} else {
referencerDefinition = definition;
definition = TagDefinitionLoader.load(linkage.getLinkingTag(), marcVersion);

if (definition == null) {
definition = referencerDefinition;
validationErrors.add(
new ValidationError(
record.getId(), definition.getTag() + "$6",
ValidationErrorType.RECORD_INVALID_LINKAGE,
String.format("refers to field %s, which is not defined", linkage.getLinkingTag()),
definition.getDescriptionUrl()));
String message = String.format("refers to field %s, which is not defined",
linkage.getLinkingTag());
addError(definition.getTag() + "$6", RECORD_INVALID_LINKAGE, message);
isValid = false;
} else {
_subfields = subfields;
Expand All @@ -451,23 +448,15 @@ public boolean validate(MarcVersion marcVersion) {
}
}
} catch (ParserException e) {
validationErrors.add(
new ValidationError(
record.getId(), definition.getTag() + "$6",
ValidationErrorType.RECORD_INVALID_LINKAGE, e.getMessage(),
definition.getDescriptionUrl()
)
);
addError(definition.getTag() + "$6", RECORD_INVALID_LINKAGE, e.getMessage());
}
}
}
}
}

if (unhandledSubfields != null) {
validationErrors.add(new ValidationError(record.getId(), definition.getTag(),
ValidationErrorType.SUBFIELD_UNDEFINED, StringUtils.join(unhandledSubfields, ", "),
definition.getDescriptionUrl()));
addError(SUBFIELD_UNDEFINED, StringUtils.join(unhandledSubfields, ", "));
isValid = false;
}

Expand All @@ -490,21 +479,21 @@ public boolean validate(MarcVersion marcVersion) {
definition.getVersionSpecificSubfield(
marcVersion, subfield.getCode()));
} else {
validationErrors.add(
new ValidationError(
record.getId(), definition.getTag(),
ValidationErrorType.SUBFIELD_UNDEFINED, subfield.getCode(), definition.getDescriptionUrl()));
addError(SUBFIELD_UNDEFINED, subfield.getCode());
isValid = false;
continue;
}
}
Utils.count(subfield.getDefinition(), counter);
/*
if (!counter.containsKey(subfield.getDefinition())) {
counter.put(subfield.getDefinition(), 0);
}
counter.put(subfield.getDefinition(), counter.get(subfield.getDefinition()) + 1);
*/

if (!subfield.validate(marcVersion)) {
validationErrors.addAll(subfield.getValidationErrors());
errors.addAll(subfield.getValidationErrors());
isValid = false;
}
}
Expand All @@ -513,11 +502,9 @@ public boolean validate(MarcVersion marcVersion) {
SubfieldDefinition subfieldDefinition = entry.getKey();
Integer count = entry.getValue();
if (count > 1
&& subfieldDefinition.getCardinality().equals(Cardinality.Nonrepeatable)) {
validationErrors.add(new ValidationError(record.getId(), subfieldDefinition.getPath(),
ValidationErrorType.SUBFIELD_NONREPEATABLE,
String.format("there are %d instances", count),
definition.getDescriptionUrl()));
&& subfieldDefinition.getCardinality().equals(Cardinality.Nonrepeatable)) {
addError(subfieldDefinition, SUBFIELD_NONREPEATABLE,
String.format("there are %d instances", count));
isValid = false;
}
}
Expand Down Expand Up @@ -545,24 +532,16 @@ private boolean validateIndicator(Indicator indicatorDefinition,
if (!indicatorDefinition.isVersionSpecificCode(marcVersion, value)) {
isValid = false;
if (indicatorDefinition.isHistoricalCode(value)) {
validationErrors.add(
new ValidationError(
record.getId(),
path,
ValidationErrorType.INDICATOR_OBSOLETE,
value,
definition.getDescriptionUrl()));
addError(path, INDICATOR_OBSOLETE, value);
} else {
validationErrors.add(new ValidationError(record.getId(), path,
ValidationErrorType.INDICATOR_INVALID_VALUE, value, definition.getDescriptionUrl()));
addError(path, INDICATOR_INVALID_VALUE, value);
}
}
}
} else {
if (!value.equals(" ")) {
if (!indicatorDefinition.isVersionSpecificCode(marcVersion, value)) {
validationErrors.add(new ValidationError(record.getId(), path,
ValidationErrorType.INDICATOR_NON_EMPTY, value, definition.getDescriptionUrl()));
addError(path, INDICATOR_NON_EMPTY, value);
isValid = false;
}
}
Expand All @@ -576,7 +555,7 @@ public DataFieldKeyGenerator getKeyGenerator(SolrFieldType type) {

@Override
public List<ValidationError> getValidationErrors() {
return validationErrors;
return errors.getErrors();
}

public void addUnhandledSubfields(String code) {
Expand All @@ -585,6 +564,19 @@ public void addUnhandledSubfields(String code) {
unhandledSubfields.add(code);
}

/**
 * Records a validation error located at this field's own tag.
 *
 * @param type    the kind of validation error
 * @param message the detail text stored with the error
 */
private void addError(ValidationErrorType type, String message) {
  String tag = definition.getTag();
  addError(tag, type, message);
}

/**
 * Records a validation error located at the path of the given subfield definition.
 *
 * @param subfieldDefinition the subfield definition whose path locates the error
 * @param type               the kind of validation error
 * @param message            the detail text stored with the error
 */
private void addError(SubfieldDefinition subfieldDefinition, ValidationErrorType type, String message) {
  String subfieldPath = subfieldDefinition.getPath();
  addError(subfieldPath, type, message);
}

/**
 * Adds a validation error to this field's error collector, attaching the
 * record id and the description URL of the field definition.
 *
 * @param path    the location reported with the error
 * @param type    the kind of validation error
 * @param message the detail text stored with the error
 */
private void addError(String path, ValidationErrorType type, String message) {
  errors.add(record.getId(), path, type, message, definition.getDescriptionUrl());
}

@Override
public String toString() {
return "DataField{"
Expand Down
60 changes: 28 additions & 32 deletions src/main/java/de/gwdg/metadataqa/marc/MarcSubfield.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import de.gwdg.metadataqa.marc.definition.general.parser.ParserException;
import de.gwdg.metadataqa.marc.definition.general.parser.SubfieldContentParser;
import de.gwdg.metadataqa.marc.definition.general.validator.SubfieldValidator;
import de.gwdg.metadataqa.marc.model.validation.ErrorsCollector;
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorType;
import de.gwdg.metadataqa.marc.utils.keygenerator.DataFieldKeyGenerator;
Expand All @@ -13,17 +14,19 @@
import java.util.*;
import java.util.logging.Logger;

import static de.gwdg.metadataqa.marc.model.validation.ValidationErrorType.*;

public class MarcSubfield implements Validatable, Serializable {

private static final Logger logger = Logger.getLogger(MarcSubfield.class.getCanonicalName());

private MarcRecord record;
private DataField field;
private SubfieldDefinition definition;
private String code;
private String value;
private final String code;
private final String value;
private String codeForIndex = null;
private List<ValidationError> validationErrors = null;
private ErrorsCollector errors = null;
private Linkage linkage;
private String referencePath;

Expand Down Expand Up @@ -124,7 +127,7 @@ public Map<String, List<String>> getKeyValuePairs(DataFieldKeyGenerator keyGener
Map<String, List<String>> pairs = new HashMap<>();
String prefix = keyGenerator.forSubfield(this);

pairs.put(prefix, Arrays.asList(resolve()));
pairs.put(prefix, Collections.singletonList(resolve()));
if (getDefinition() != null) {
getKeyValuePairsForPositionalSubfields(pairs, prefix);
getKeyValuePairsFromContentParser(keyGenerator, pairs);
Expand All @@ -140,7 +143,7 @@ private void getKeyValuePairsFromContentParser(DataFieldKeyGenerator keyGenerato
for (String key : extra.keySet()) {
pairs.put(
keyGenerator.forSubfield(this, key),
Arrays.asList(extra.get(key))
Collections.singletonList(extra.get(key))
);
}
}
Expand All @@ -151,25 +154,24 @@ private void getKeyValuePairsForPositionalSubfields(Map<String, List<String>> pa
if (getDefinition().hasPositions()) {
Map<String, String> extra = getDefinition().resolvePositional(getValue());
for (String key : extra.keySet()) {
pairs.put(prefix + "_" + key, Arrays.asList(extra.get(key)));
pairs.put(prefix + "_" + key, Collections.singletonList(extra.get(key)));
}
}
}


@Override
public boolean validate(MarcVersion marcVersion) {
boolean isValid = true;
validationErrors = new ArrayList<>();
errors = new ErrorsCollector();
if (marcVersion == null)
marcVersion = MarcVersion.MARC21;

if (definition == null) {
validationErrors.add(new ValidationError(record.getId(), field.getDefinition().getTag(),
ValidationErrorType.SUBFIELD_UNDEFINED, code, field.getDefinition().getDescriptionUrl()));
isValid = false;
addError(field.getDefinition().getTag(), SUBFIELD_UNDEFINED, code);
return false;
} else {
if (code == null) {
validationErrors.add(new ValidationError(record.getId(), field.getDefinition().getTag(),
ValidationErrorType.SUBFIELD_NULL_CODE, code, field.getDefinition().getDescriptionUrl()));
addError(field.getDefinition().getTag(), SUBFIELD_NULL_CODE, code);
isValid = false;
} else {
if (definition.hasValidator()) {
Expand All @@ -184,15 +186,7 @@ public boolean validate(MarcVersion marcVersion) {
message += String.format(" (the field is embedded in %s)", referencePath);
}
String path = (referencePath == null ? definition.getPath() : referencePath + "->" + definition.getPath());
validationErrors.add(
new ValidationError(
record.getId(),
path,
ValidationErrorType.SUBFIELD_INVALID_VALUE,
message,
definition.getParent().getDescriptionUrl()
)
);
addError(path, ValidationErrorType.SUBFIELD_INVALID_VALUE, message);
isValid = false;
}
}
Expand All @@ -206,7 +200,7 @@ private boolean validateWithValidator() {
SubfieldValidator validator = definition.getValidator();
ValidatorResponse response = validator.isValid(this);
if (!response.isValid()) {
validationErrors.addAll(response.getValidationErrors());
errors.addAll(response.getValidationErrors());
isValid = false;
}
return isValid;
Expand All @@ -218,22 +212,24 @@ private boolean validateWithParser() {
try {
parser.parse(getValue());
} catch (ParserException e) {
validationErrors.add(
new ValidationError(
record.getId(),
definition.getPath(),
ValidationErrorType.SUBFIELD_UNPARSABLE_CONTENT,
e.getMessage(),
definition.getParent().getDescriptionUrl()
));
addError(SUBFIELD_UNPARSABLE_CONTENT, e.getMessage());
isValid = false;
}
return isValid;
}

@Override
public List<ValidationError> getValidationErrors() {
return validationErrors;
return errors.getErrors();
}

/**
 * Records a validation error located at the path of this subfield's definition.
 *
 * @param type    the kind of validation error
 * @param message the detail text stored with the error
 */
private void addError(ValidationErrorType type, String message) {
  String definitionPath = definition.getPath();
  addError(definitionPath, type, message);
}

/**
 * Adds a validation error to this subfield's error collector, attaching the
 * record id and a description URL.
 *
 * The URL normally comes from the parent of the subfield definition. This
 * method is also called when the subfield has no definition at all (the
 * undefined-subfield case), so in that situation the URL is taken from the
 * enclosing field's definition instead of dereferencing the missing
 * definition and throwing a NullPointerException.
 *
 * @param path    the location reported with the error
 * @param type    the kind of validation error
 * @param message the detail text stored with the error
 */
private void addError(String path, ValidationErrorType type, String message) {
  String url = (definition != null)
    ? definition.getParent().getDescriptionUrl()
    // no subfield definition: fall back to the field-level description URL
    : field.getDefinition().getDescriptionUrl();
  errors.add(record.getId(), path, type, message, url);
}

@Override
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/de/gwdg/metadataqa/marc/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,11 @@ public static MarcVersion getVersion(Class<? extends DataFieldDefinition> field)
public static MarcVersion package2version(String packageName) {
MarcVersion version = MarcVersion.MARC21;
switch (packageName) {
case "bltags": version = MarcVersion.BL; break;
case "dnbtags": version = MarcVersion.DNB; break;
case "fennicatags": version = MarcVersion.FENNICA; break;
case "oclctags": version = MarcVersion.OCLC; break;
case "genttags": version = MarcVersion.GENT; break;
case "dnbtags": version = MarcVersion.DNB; break;
case "oclctags": version = MarcVersion.OCLC; break;
case "sztetags": version = MarcVersion.SZTE; break;
case "nkcrtags": version = MarcVersion.NKCR; break;
}
Expand Down
Loading

0 comments on commit 73b4309

Please sign in to comment.