Skip to content

Commit

Permalink
issue #154: subject indexing analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Aug 16, 2022
1 parent fbc1126 commit d8a0bfe
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.general.indexer.subject.ClassificationSchemes;

import java.util.ArrayList;
Expand Down Expand Up @@ -71,14 +72,21 @@ public class ClassificationAnalyzer {
"653" // Index Term - Uncontrolled
);

private static final List<FieldWithScheme> fieldsWithScheme = Arrays.asList(
/**
 * MARC21 tags, each paired with a classification scheme name.
 * process() hands this list to processFieldsWithScheme() when the record's
 * schema type is MARC21.
 * NOTE(review): presumably these fields carry no explicit source subfield and
 * the scheme is implied by the tag itself -- confirm against FieldWithScheme usage.
 */
private static final List<FieldWithScheme> MARC21_FIELD_WITH_SCHEMES = Arrays.asList(
new FieldWithScheme("080", "Universal Decimal Classification"),
new FieldWithScheme("082", "Dewey Decimal Classification"),
new FieldWithScheme("083", "Dewey Decimal Classification"),
new FieldWithScheme("085", "Dewey Decimal Classification")
// new FieldWithScheme("086", "Government Document Classification");
);

/**
 * PICA tags, each paired with a classification scheme name.
 * process() hands this list to processFieldsWithScheme() when the record's
 * schema type is PICA; no other analysis step is run for PICA records there.
 * NOTE(review): the tag-to-scheme assignments (e.g. "045B" as "Other scheme")
 * are taken as given here -- verify against the PICA format documentation.
 */
private static final List<FieldWithScheme> PICA_FIELDS_WITH_SCHEME = Arrays.asList(
new FieldWithScheme("045A", "Library of Congress Classification"),
new FieldWithScheme("045B", "Other scheme"),
new FieldWithScheme("045F", "Dewey Decimal Classification"),
new FieldWithScheme("045R", "Regensburger Verbundklassifikation")
);

public ClassificationAnalyzer(MarcRecord marcRecord, ClassificationStatistics statistics) {
this.marcRecord = marcRecord;
this.statistics = statistics;
Expand All @@ -88,11 +96,15 @@ public int process() {
var total = 0;
schemasInRecord = new ArrayList<>();

total = processFieldsWithIndicator1AndSubfield2(total);
total = processFieldsWithIndicator2AndSubfield2(total);
total = processFieldsWithSubfield2(total);
total = processFieldsWithoutSource(total);
total = processFieldsWithScheme(total);
if (marcRecord.getSchemaType().equals(SchemaType.MARC21)) {
total = processFieldsWithIndicator1AndSubfield2(total);
total = processFieldsWithIndicator2AndSubfield2(total);
total = processFieldsWithSubfield2(total);
total = processFieldsWithoutSource(total);
total = processFieldsWithScheme(total, MARC21_FIELD_WITH_SCHEMES);
} else if (marcRecord.getSchemaType().equals(SchemaType.PICA)) {
total = processFieldsWithScheme(total, PICA_FIELDS_WITH_SCHEME);
}

increaseCounters(total);

Expand All @@ -114,7 +126,7 @@ private void increaseCounters(int total) {
count(collocation, statistics.getCollocationHistogram());
}

private int processFieldsWithScheme(int total) {
private int processFieldsWithScheme(int total, List<FieldWithScheme> fieldsWithScheme) {
for (FieldWithScheme fieldWithScheme : fieldsWithScheme) {
var count = processFieldWithScheme(marcRecord, fieldWithScheme);
if (count > 0)
Expand Down
21 changes: 16 additions & 5 deletions src/main/java/de/gwdg/metadataqa/marc/dao/MarcRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ public class MarcRecord implements Extractable, Validatable, Serializable {
private static final Pattern dataFieldPattern = Pattern.compile("^(\\d\\d\\d)\\$(.*)$");
private static final Pattern positionalPattern = Pattern.compile("^(Leader|00[678])/(.*)$");
private static final List<String> simpleControlTags = Arrays.asList("001", "003", "005");
/**
 * Tags that getSubjects() looks up for MARC21 records. It is also the
 * fallback list for any schema type other than PICA (the switch's default branch).
 */
private static final List<String> MARC21_SUBJECT_TAGS = Arrays.asList(
"052", "055", "072", "080", "082", "083", "084", "085", "086",
"600", "610", "611", "630", "647", "648", "650", "651",
"653", "654", "655", "656", "657", "658", "662"
);
/**
 * Tags that getSubjects() looks up when the record's schema type is PICA.
 * NOTE(review): this constant is public while MARC21_SUBJECT_TAGS is private;
 * no external use is visible in this view -- confirm whether the wider
 * visibility is intentional.
 */
public static final List<String> PICA_SUBJECT_TAGS = Arrays.asList("045A", "045B", "045F", "045R");

private static final Map<String, Boolean> undefinedTags = new HashMap<>();

private Leader leader;
Expand Down Expand Up @@ -745,11 +752,15 @@ public List<DataField> getAuthorityFields() {

public List<DataField> getSubjects() {
List<DataField> subjects = new ArrayList<>();
List<String> tags = Arrays.asList(
"052", "055", "072", "080", "082", "083", "084", "085", "086",
"600", "610", "611", "630", "647", "648", "650", "651",
"653", "654", "655", "656", "657", "658", "662"
);
List<String> tags;
switch (schemaType) {
case PICA:
tags = PICA_SUBJECT_TAGS; break;
case MARC21:
default:
tags = MARC21_SUBJECT_TAGS; break;
}
logger.info("tags: " + tags);
for (String tag : tags) {
List<DataField> fields = getDatafield(tag);
if (fields != null && !fields.isEmpty())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ private void initialize() {
schemes.put("Chronological term", "chronological");
schemes.put("Geographic name", "geographic");
schemes.put("Genre/form term", "genre");

// PICA
schemes.put("Regensburger Verbundklassifikation", "rvk");
}

public String resolve(String key) {
Expand Down

0 comments on commit d8a0bfe

Please sign in to comment.