Skip to content

Commit

Permalink
issue #75: reformat and sort validator output.
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Dec 21, 2020
1 parent b629508 commit 82436c5
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 46 deletions.
71 changes: 52 additions & 19 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Validator.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import de.gwdg.metadataqa.marc.cli.processor.MarcFileProcessor;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorCategory;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormatter;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorType;
import org.apache.commons.cli.HelpFormatter;
Expand Down Expand Up @@ -41,8 +42,8 @@ public class Validator implements MarcFileProcessor, Serializable {
private ValidatorParameters parameters;
private Map<Integer, Integer> totalRecordCounter = new HashMap<>();
private Map<Integer, Integer> totalInstanceCounter = new HashMap<>();
private Map<String, Integer> categoryRecordCounter = new HashMap<>();
private Map<String, Integer> categoryInstanceCounter = new HashMap<>();
private Map<ValidationErrorCategory, Integer> categoryRecordCounter = new HashMap<>();
private Map<ValidationErrorCategory, Integer> categoryInstanceCounter = new HashMap<>();
private Map<ValidationErrorType, Integer> typeRecordCounter = new HashMap<>();
private Map<ValidationErrorType, Integer> typeInstanceCounter = new HashMap<>();
private Map<ValidationError, Integer> instanceBasedErrorCounter = new HashMap<>();
Expand Down Expand Up @@ -174,7 +175,7 @@ public void processRecord(MarcRecord marcRecord, int i) {
List<ValidationError> allButInvalidFieldErrors = new ArrayList<>();
Set<Integer> uniqueErrors = new HashSet<>();
Set<ValidationErrorType> uniqueTypes = new HashSet<>();
Set<String> uniqueCategories = new HashSet<>();
Set<ValidationErrorCategory> uniqueCategories = new HashSet<>();
for (ValidationError error : errors) {
if (!instanceBasedErrorCounter.containsKey(error)) {
error.setId(vErrorId++);
Expand Down Expand Up @@ -220,7 +221,7 @@ public void processRecord(MarcRecord marcRecord, int i) {
for (ValidationErrorType id : uniqueTypes) {
count(id, typeRecordCounter);
}
for (String id : uniqueCategories) {
for (ValidationErrorCategory id : uniqueCategories) {
count(id, categoryRecordCounter);
}
count(1, totalRecordCounter);
Expand Down Expand Up @@ -299,6 +300,33 @@ private void printSummary(char separator) {
parameters.getFormat()
);
print(summaryFile, header + "\n");
instanceBasedErrorCounter
.entrySet()
.stream()
.sorted((a,b) -> {
Integer typeIdA = Integer.valueOf(a.getKey().getType().getId());
Integer typeIdB = Integer.valueOf(b.getKey().getType().getId());
int result = typeIdA.compareTo(typeIdB);
if (result == 0) {
Integer recordCountA = Integer.valueOf(recordBasedErrorCounter.get(a.getKey().getId()));
Integer recordCountB = Integer.valueOf(recordBasedErrorCounter.get(b.getKey().getId()));
result = recordCountB.compareTo(recordCountA);
}
return result;
})
.forEach(
entry -> {
ValidationError error = entry.getKey();
int instanceCount = entry.getValue();
String formattedOutput = ValidationErrorFormatter.formatForSummary(
error, parameters.getFormat()
);
print(summaryFile, createRow(
separator, error.getId(), formattedOutput, instanceCount, recordBasedErrorCounter.get(error.getId())
));
}
);
/*
for (Map.Entry<ValidationError, Integer> entry : instanceBasedErrorCounter.entrySet()) {
ValidationError error = entry.getKey();
int count = entry.getValue();
Expand All @@ -309,21 +337,25 @@ private void printSummary(char separator) {
separator, error.getId(), formattedOutput, count, recordBasedErrorCounter.get(error.getId())
));
}
*/
}

private void printTypeCounts() {
Path path = Paths.get(parameters.getOutputDir(), "issue-by-type.csv");
try (BufferedWriter writer = Files.newBufferedWriter(path)) {
writer.write(createRow("id", "type", "instances", "records"));
writer.write(createRow("id", "categoryId", "category", "type", "instances", "records"));
typeRecordCounter
.entrySet()
.stream()
.sorted((a, b) -> ((Integer)a.getKey().getId()).compareTo((Integer) b.getKey().getId()))
.forEach(entry -> {
ValidationErrorType type = entry.getKey();
int records = entry.getValue();
int instances = typeInstanceCounter.get(entry.getKey());
try {
writer.write(createRow(type.getId(), quote(type.getMessage()), instances, records));
writer.write(createRow(
type.getId(), type.getCategory().getId(), type.getCategory().getName(), quote(type.getMessage()), instances, records
));
} catch (IOException e) {
e.printStackTrace();
}
Expand Down Expand Up @@ -358,20 +390,21 @@ private void printTotalCounts() {
private void printCategoryCounts() {
Path path = Paths.get(parameters.getOutputDir(), "issue-by-category.csv");
try (BufferedWriter writer = Files.newBufferedWriter(path)) {
writer.write(createRow("category", "instances", "records"));
writer.write(createRow("id", "category", "instances", "records"));
categoryRecordCounter
.entrySet()
.stream()
.forEach(entry -> {
String category = entry.getKey();
int records = entry.getValue();
int instances = categoryInstanceCounter.getOrDefault(entry.getKey(), -1);
try {
writer.write(createRow(category, instances, records));
} catch (IOException e) {
e.printStackTrace();
}
});
.entrySet()
.stream()
.sorted((a, b) -> ((Integer)a.getKey().getId()).compareTo((Integer) b.getKey().getId()))
.forEach(entry -> {
ValidationErrorCategory category = entry.getKey();
int records = entry.getValue();
int instances = categoryInstanceCounter.getOrDefault(entry.getKey(), -1);
try {
writer.write(createRow(category.getId(), category.getName(), instances, records));
} catch (IOException e) {
e.printStackTrace();
}
});
} catch (IOException e) {
e.printStackTrace();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package de.gwdg.metadataqa.marc.model.validation;

/**
 * Top-level grouping of validation errors.
 *
 * <p>Each constant carries a numeric identifier and a human-readable name,
 * exposed through {@link #getId()} and {@link #getName()}.
 */
public enum ValidationErrorCategory {

  RECORD(1, "record"),
  CONTROLFIELD(2, "control field"),
  DATAFIELD(3, "data field"),
  INDICATOR(4, "indicator"),
  SUBFIELD(5, "subfield");

  /** Numeric identifier of the category. */
  private final int id;

  /** Human-readable name of the category. */
  private final String name;

  /**
   * Binds a constant to its identifier and name.
   *
   * @param categoryId numeric identifier of the category
   * @param label human-readable name of the category
   */
  ValidationErrorCategory(int categoryId, String label) {
    id = categoryId;
    name = label;
  }

  /**
   * @return the numeric identifier of this category
   */
  public int getId() {
    return id;
  }

  /**
   * @return the human-readable name of this category
   */
  public String getName() {
    return name;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ private static String formatTextWithoutId(ValidationError error) {
}

private static String[] headerForSummary() {
return new String[]{"id", "MarcPath", "typeId", "type", "message", "url", "instances", "records"};
return new String[]{"id", "MarcPath", "categoryId", "typeId", "type", "message", "url", "instances", "records"};
}

private static String[] headerForCollector() {
Expand All @@ -169,6 +169,7 @@ private static String[] headerForCollector() {
private static String[] asArrayWithoutId(ValidationError error) {
return new String[]{
error.getMarcPath(),
String.valueOf(error.getType().getCategory().getId()),
String.valueOf(error.getType().getId()),
error.getType().getMessage(),
error.getMessage(),
Expand All @@ -179,6 +180,7 @@ private static String[] asArrayWithoutId(ValidationError error) {
private static List<String> asListWithoutId(ValidationError error) {
return Arrays.asList(
error.getMarcPath(),
String.valueOf(error.getType().getCategory().getId()),
String.valueOf(error.getType().getId()),
error.getType().getMessage(),
error.getMessage(),
Expand All @@ -190,6 +192,7 @@ private static List<String> asList(ValidationError error) {
return Arrays.asList(
error.getRecordId(),
error.getMarcPath(),
String.valueOf(error.getType().getCategory().getId()),
String.valueOf(error.getType().getId()),
error.getType().getMessage(),
error.getMessage(),
Expand All @@ -201,6 +204,7 @@ private static String[] asArray(ValidationError error) {
return new String[]{
error.getRecordId(),
error.getMarcPath(),
String.valueOf(error.getType().getCategory().getId()),
String.valueOf(error.getType().getId()),
error.getType().getMessage(),
error.getMessage(),
Expand All @@ -209,6 +213,6 @@ private static String[] asArray(ValidationError error) {
}

private static String[] headerArray() {
return new String[]{"recordId", "MarcPath", "typeId", "type", "message", "url"};
return new String[]{"recordId", "MarcPath", "categoryId", "typeId", "type", "message", "url"};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,40 @@
public enum ValidationErrorType {

// record
RECORD_UNDETECTABLE_TYPE(1, "undetectableType", "record: undetectable type", "record"),
RECORD_INVALID_LINKAGE(2, "invalidLinkage", "record: invalid linkage", "record"),
RECORD_AMBIGUOUS_LINKAGE(3, "ambiguousLinkage", "record: ambiguous linkage", "record"),
RECORD_UNDETECTABLE_TYPE(1, "undetectableType", "undetectable type", ValidationErrorCategory.RECORD),
RECORD_INVALID_LINKAGE(2, "invalidLinkage", "invalid linkage", ValidationErrorCategory.RECORD),
RECORD_AMBIGUOUS_LINKAGE(3, "ambiguousLinkage", "ambiguous linkage", ValidationErrorCategory.RECORD),
// control subfield
CONTROL_SUBFIELD_OBSOLETE_CODE(4, "obsoleteControlSubfield", "control subfield: obsolete code", "control subfield"),
CONTROL_SUBFIELD_INVALID_CODE(5, "controlValueContainsInvalidCode", "control subfield: invalid code", "control subfield"),
CONTROL_SUBFIELD_INVALID_VALUE(6, "hasInvalidValue", "control subfield: invalid value", "control subfield"),
CONTROL_SUBFIELD_OBSOLETE_CODE(4, "obsoleteControlSubfield", "obsolete code", ValidationErrorCategory.CONTROLFIELD),
CONTROL_SUBFIELD_INVALID_CODE(5, "controlValueContainsInvalidCode", "invalid code", ValidationErrorCategory.CONTROLFIELD),
CONTROL_SUBFIELD_INVALID_VALUE(6, "hasInvalidValue", "invalid value", ValidationErrorCategory.CONTROLFIELD),
// field
FIELD_MISSING_REFERENCE_SUBFIELD(7, "missingSubfield", "field: missing reference subfield (880$6)", "field"),
FIELD_NONREPEATABLE(8, "nonrepeatableField", "field: repetition of non-repeatable field", "field"),
FIELD_UNDEFINED(9, "undefinedField", "field: undefined field", "field"),
FIELD_MISSING_REFERENCE_SUBFIELD(7, "missingSubfield", "missing reference subfield (880$6)", ValidationErrorCategory.DATAFIELD),
FIELD_NONREPEATABLE(8, "nonrepeatableField", "repetition of non-repeatable field", ValidationErrorCategory.DATAFIELD),
FIELD_UNDEFINED(9, "undefinedField", "undefined field", ValidationErrorCategory.DATAFIELD),
// indicator
INDICATOR_OBSOLETE(10, "obsoleteIndicator", "indicator: obsolete value", "indicator"),
INDICATOR_NON_EMPTY(11, "nonEmptyIndicator", "indicator: non-empty indicator", "indicator"),
INDICATOR_INVALID_VALUE(12, "hasInvalidValue", "indicator: invalid value", "indicator"),
INDICATOR_OBSOLETE(10, "obsoleteIndicator", "obsolete value", ValidationErrorCategory.INDICATOR),
INDICATOR_NON_EMPTY(11, "nonEmptyIndicator", "non-empty indicator", ValidationErrorCategory.INDICATOR),
INDICATOR_INVALID_VALUE(12, "hasInvalidValue", "invalid value", ValidationErrorCategory.INDICATOR),
// subfield
SUBFIELD_UNDEFINED(13, "undefinedSubfield", "subfield: undefined subfield", "subfield"),
SUBFIELD_INVALID_LENGTH(14, "invalidLength", "subfield: invalid length", "subfield"),
SUBFIELD_INVALID_CLASSIFICATION_REFERENCE(15, "invalidReference", "subfield: invalid classification reference", "subfield"),
SUBFIELD_PATTERN_MISMATCH(16, "patternMismatch", "subfield: content does not match any patterns", "subfield"),
SUBFIELD_NONREPEATABLE(17, "nonrepeatableSubfield", "subfield: repetition of non-repeatable subfield", "subfield"),
SUBFIELD_ISBN(18, "invalidISBN", "subfield: invalid ISBN", "subfield"),
SUBFIELD_ISSN(19, "invalidISSN", "subfield: invalid ISSN", "subfield"),
SUBFIELD_UNPARSABLE_CONTENT(20, "unparsableContent", "subfield: content is not well-formatted", "subfield"),
SUBFIELD_NULL_CODE(21, "nullCode", "subfield: null subfield code", "subfield"),
SUBFIELD_INVALID_VALUE(22, "hasInvalidValue", "subfield: invalid value", "subfield"),
SUBFIELD_UNDEFINED(13, "undefinedSubfield", "undefined subfield", ValidationErrorCategory.SUBFIELD),
SUBFIELD_INVALID_LENGTH(14, "invalidLength", "invalid length", ValidationErrorCategory.SUBFIELD),
SUBFIELD_INVALID_CLASSIFICATION_REFERENCE(15, "invalidReference", "invalid classification reference", ValidationErrorCategory.SUBFIELD),
SUBFIELD_PATTERN_MISMATCH(16, "patternMismatch", "content does not match any patterns", ValidationErrorCategory.SUBFIELD),
SUBFIELD_NONREPEATABLE(17, "nonrepeatableSubfield", "repetition of non-repeatable subfield", ValidationErrorCategory.SUBFIELD),
SUBFIELD_ISBN(18, "invalidISBN", "invalid ISBN", ValidationErrorCategory.SUBFIELD),
SUBFIELD_ISSN(19, "invalidISSN", "invalid ISSN", ValidationErrorCategory.SUBFIELD),
SUBFIELD_UNPARSABLE_CONTENT(20, "unparsableContent", "content is not well-formatted", ValidationErrorCategory.SUBFIELD),
SUBFIELD_NULL_CODE(21, "nullCode", "null subfield code", ValidationErrorCategory.SUBFIELD),
SUBFIELD_INVALID_VALUE(22, "hasInvalidValue", "invalid value", ValidationErrorCategory.SUBFIELD),
;

private final int id;
private final String code;
private final String message;
private final String category;
private final ValidationErrorCategory category;

ValidationErrorType(int id, String code, String message, String category) {
ValidationErrorType(int id, String code, String message, ValidationErrorCategory category) {
this.id = id;
this.code = code;
this.message = message;
Expand All @@ -55,7 +55,7 @@ public String getMessage() {
return message;
}

public String getCategory() {
public ValidationErrorCategory getCategory() {
return category;
}
}

0 comments on commit 82436c5

Please sign in to comment.