Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MODINV-1092] Ensure that OCLC 035 entries display on separate lines of underlying MARC #775

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,14 @@
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
Expand Down Expand Up @@ -80,6 +79,7 @@ public final class AdditionalFieldsUtil {
private static final CacheLoader<String, org.marc4j.marc.Record> parsedRecordContentCacheLoader;
private static final LoadingCache<String, org.marc4j.marc.Record> parsedRecordContentCache;
private static final String OCLC = "OCoLC";
private static final String OCLC_PREFIX = "(OCoLC)";
private static final ObjectMapper objectMapper = new ObjectMapper();
public static final String FIELDS = "fields";
private static final String OCLC_PATTERN = "\\((" + OCLC + ")\\)((ocm|ocn|on)?0*|([a-zA-Z]+)0*)(\\d+\\w*)";
Expand Down Expand Up @@ -272,102 +272,107 @@ public static void move001To035(Record srcRecord) {
removeField(srcRecord, TAG_003);
}

public static void normalize035(Record srsRecord) {
List<Subfield> subfields = get035SubfieldOclcValues(srsRecord, TAG_035, TAG_035_SUB);
public static void normalize035(Record srcRecord) {
List<Subfield> subfields = get035SubfieldOclcValues(srcRecord, TAG_035);
if (!subfields.isEmpty()) {
Set<String> normalized035Subfields = formatOclc(subfields);

updateOclcSubfield(srsRecord, normalized035Subfields);
formatOclc(subfields);
deduplicateOclc(srcRecord, subfields, TAG_035);
recalculateLeaderAndParsedRecord(srcRecord);
}
}

private static Set<String> formatOclc(List<Subfield> subFields) {
Set<String> processedSet = new LinkedHashSet<>();
private static void formatOclc(List<Subfield> subfields) {
Pattern pattern = Pattern.compile(OCLC_PATTERN);

for (Subfield subfield : subFields) {
for (Subfield subfield : subfields) {
String data = subfield.getData().replaceAll("[.\\s]", "");
var code = subfield.getCode();
Matcher matcher = pattern.matcher(data);

if (matcher.find()) {
String oclcTag = matcher.group(1); // "OCoLC"
String numericAndTrailing = matcher.group(5); // Numeric part and any characters that follow
String prefix = matcher.group(2); // Entire prefix including letters and potentially leading zeros

if (prefix != null && (prefix.startsWith("ocm") || prefix.startsWith("ocn") || prefix.startsWith("on"))) {
// If "ocm" or "ocn", strip entirely from the prefix
processedSet.add(code + "&(" + oclcTag + ")" + numericAndTrailing);
subfield.setData("(" + oclcTag + ")" + numericAndTrailing);
} else {
// For other cases, strip leading zeros only from the numeric part
numericAndTrailing = numericAndTrailing.replaceFirst("^0+", "");
if (prefix != null) {
prefix = prefix.replaceAll("\\d+", ""); // Safely remove digits from the prefix if not null
}
// Add back any other prefix that might have been included like "tfe"
processedSet.add(code + "&(" + oclcTag + ")" + (prefix != null ? prefix : "") + numericAndTrailing);
subfield.setData("(" + oclcTag + ")" + (prefix != null ? prefix : "") + numericAndTrailing);
}
} else {
// If it does not match, add the data as is
processedSet.add(code + "&" + data);
}
}
return processedSet;
}

private static void updateOclcSubfield(Record recordForUpdate,
Set<String> normalizedValues) {
try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
if (recordForUpdate != null && recordForUpdate.getParsedRecord() != null && recordForUpdate.getParsedRecord().getContent() != null) {
MarcWriter streamWriter = new MarcStreamWriter(new ByteArrayOutputStream());
MarcJsonWriter jsonWriter = new MarcJsonWriter(os);
MarcFactory factory = MarcFactory.newInstance();
org.marc4j.marc.Record marcRecord = computeMarcRecord(recordForUpdate);
if (marcRecord != null) {
private static void deduplicateOclc(Record srcRecord, List<Subfield> subfields, String tag) {
List<Subfield> subfieldsToDelete = new ArrayList<>();

DataField dataField = factory.newDataField(TAG_035, TAG_035_IND, TAG_035_IND);
normalizedValues.forEach(value -> {
var v = value.split("&");
dataField.addSubfield(factory.newSubfield(v[0].charAt(0), v[1]));
});
for (Subfield subfield: new ArrayList<>(subfields)) {
if (subfields.stream().anyMatch(s -> isOclcSubfieldDuplicated(subfield, s))) {
subfieldsToDelete.add(subfield);
subfields.remove(subfield);
}
}
Optional.ofNullable(computeMarcRecord(srcRecord)).ifPresent(marcRecord -> {
List<VariableField> variableFields = marcRecord.getVariableFields(tag);

replaceOclc035FieldWithNormalizedData(marcRecord, dataField);
subfieldsToDelete.forEach(subfieldToDelete ->
variableFields.forEach(field -> removeSubfieldIfExist(marcRecord, field, subfieldToDelete)));
});
}

// use stream writer to recalculate leader
streamWriter.write(marcRecord);
jsonWriter.write(marcRecord);
/**
 * Tells whether two subfields are duplicates of each other.
 * Two subfields count as duplicates when they are distinct object instances
 * that carry equal data and the same subfield code.
 *
 * @param s1 the subfield being checked
 * @param s2 the subfield it is compared against
 * @return {@code true} if {@code s1} and {@code s2} are different instances with equal data and code
 */
private static boolean isOclcSubfieldDuplicated(Subfield s1, Subfield s2) {
  // A subfield is never a duplicate of itself.
  if (s1 == s2) {
    return false;
  }
  if (!s1.getData().equals(s2.getData())) {
    return false;
  }
  return s1.getCode() == s2.getCode();
}

String parsedContentString = new JsonObject(os.toString()).encode();
// save parsed content string to cache then set it on the record
parsedRecordContentCache.put(parsedContentString, marcRecord);
recordForUpdate.setParsedRecord(recordForUpdate.getParsedRecord().withContent(parsedContentString));
}
/**
 * Removes the given subfield from the field when the field is a data field that contains it.
 * If the subfield is the only one the data field holds, the whole data field is removed
 * from the MARC record instead of leaving it without subfields.
 *
 * @param marcRecord       the MARC record owning the field
 * @param field            the field to inspect; anything that is not a {@link DataField} is ignored
 * @param subfieldToDelete the subfield to remove
 */
private static void removeSubfieldIfExist(org.marc4j.marc.Record marcRecord, VariableField field, Subfield subfieldToDelete) {
  if (!(field instanceof DataField dataField)) {
    return;
  }
  List<Subfield> currentSubfields = dataField.getSubfields();
  if (!currentSubfields.contains(subfieldToDelete)) {
    return;
  }
  if (currentSubfields.size() <= 1) {
    // The subfield to delete is the last one left: drop the entire field.
    marcRecord.removeVariableField(dataField);
  } else {
    dataField.removeSubfield(subfieldToDelete);
  }
}

/**
 * Re-serializes the MARC record obtained via {@code computeMarcRecord} and stores the resulting
 * JSON string as the parsed content of the given record. The new content string is also put into
 * {@code parsedRecordContentCache} together with the MARC record it was produced from.
 * Nothing is changed when {@code computeMarcRecord} returns {@code null}.
 * Any exception is logged at warn level and not rethrown.
 *
 * @param recordForUpdate the record whose parsed content is rewritten
 */
private static void recalculateLeaderAndParsedRecord(Record recordForUpdate) {
try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
// The stream writer targets a throwaway buffer; only the JSON writer's output (os) is read back below.
MarcWriter streamWriter = new MarcStreamWriter(new ByteArrayOutputStream());
MarcJsonWriter jsonWriter = new MarcJsonWriter(os);
org.marc4j.marc.Record marcRecord = computeMarcRecord(recordForUpdate);

if (marcRecord != null) {
// use stream writer to recalculate leader
streamWriter.write(marcRecord);
jsonWriter.write(marcRecord);

// Round-trip through JsonObject to obtain the encoded form of the JSON written above.
String parsedContentString = new JsonObject(os.toString()).encode();
// save parsed content string to cache then set it on the record
parsedRecordContentCache.put(parsedContentString, marcRecord);
recordForUpdate.setParsedRecord(recordForUpdate.getParsedRecord().withContent(parsedContentString));
}
} catch (Exception e) {
// NOTE(review): two warn calls follow; the first looks like the pre-change message from the
// diff and the second its replacement - confirm that only one is intended to remain.
LOGGER.warn("Failed to update OCLC subfield for record: {}", recordForUpdate.getId(), e);
LOGGER.warn("recalculateLeaderAndParsedRecord:: Failed to recalculate leader and parsed record for record: {}", recordForUpdate.getId(), e);
}
}

public static List<Subfield> get035SubfieldOclcValues(Record srcRecord, String tag, char subfield) {
public static List<Subfield> get035SubfieldOclcValues(Record srcRecord, String tag) {
return Optional.ofNullable(computeMarcRecord(srcRecord))
.stream()
.flatMap(marcRecord -> marcRecord.getVariableFields(tag).stream())
.flatMap(field -> get035oclcSubfields(field, subfield).stream())
.toList();
.flatMap(field -> get035oclcSubfields(field).stream())
.collect(Collectors.toList());
}

private static List<Subfield> get035oclcSubfields(VariableField field, char subfield) {
private static List<Subfield> get035oclcSubfields(VariableField field) {
if (field instanceof DataField dataField) {

Optional<Subfield> oclcSubfield = dataField.getSubfields(subfield).stream()
.filter(sf -> sf.find(OCLC))
.findFirst();

if (oclcSubfield.isPresent()) {
return dataField.getSubfields();
} else {
return Collections.emptyList();
}
return dataField.getSubfields().stream()
.filter(sf -> sf.getData().startsWith(OCLC_PREFIX))
.toList();
}
return Collections.emptyList();
}
Expand Down Expand Up @@ -579,21 +584,6 @@ private static boolean removeFirstFoundFieldByName(org.marc4j.marc.Record marcRe
return isFieldFound;
}

/**
 * Replaces the 035 fields that match {@code OCLC} with a single prepared data field.
 * Every matching 035 field is removed from the record; the given data field is then inserted
 * into the record's data-field list at the index the last matching field occupied just before
 * it was removed. If the record has 035 fields but none match, the data field is inserted at
 * index 0. If the record has no 035 fields at all, nothing is changed.
 *
 * @param marcRecord the MARC record to modify
 * @param dataField  the data field to insert
 */
private static void replaceOclc035FieldWithNormalizedData(org.marc4j.marc.Record marcRecord,
                                                          DataField dataField) {
  List<VariableField> fields035 = marcRecord.getVariableFields(TAG_035);
  if (fields035.isEmpty()) {
    return;
  }

  int insertionIndex = 0;
  for (VariableField candidate : fields035) {
    if (candidate.find(OCLC)) {
      // Capture the position before the field disappears from the record.
      insertionIndex = marcRecord.getDataFields().indexOf(candidate);
      marcRecord.removeVariableField(candidate);
    }
  }
  marcRecord.getDataFields().add(insertionIndex, dataField);
}

/**
* Checks if the field contains a certain value in the selected subfield
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_001;
import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_005;
import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_035;
import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_035_SUB;
import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_999;
import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.addControlledFieldToMarcRecord;
import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.addDataFieldToMarcRecord;
Expand Down Expand Up @@ -445,6 +444,35 @@ public static Collection<Object[]> data() {
"{\"leader\":\"00098nam 22000611a 4500\",\"fields\":[{\"001\":\"in001\"}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)607TST001\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}"
},
{
"{\"leader\":\"00120nam 22000731a 4500\",\"fields\":[{\"001\":\"in001\"}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC-M)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(ABC)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)ocn0001234\"}, {\"a\":\"(OCoLC)ocn1234\"}, {\"b\":\"(OCoLC)ocn1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)ocm1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)ocn00098765\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}",

"{\"leader\":\"00218nam 22001091a 4500\",\"fields\":[{\"001\":\"in001\"}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC-M)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(ABC)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"b\":\"(OCoLC)1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)98765\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}"
},
{
"{\"leader\":\"00126nam 22000731a 4500\",\"fields\":[{\"001\":\"in001\"}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC-M)1234456\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}",

"{\"leader\":\"00126nam 22000731a 4500\",\"fields\":[{\"001\":\"in001\"}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC-M)1234456\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}"
}
});
}
Expand All @@ -466,13 +494,13 @@ public void shouldNormalizeOCoLCField035() {
}

@Test
public void shouldReturnSubfieldIfOclcExist() {
public void shouldReturnOnlyOclcSubfield() {
// given
String parsedContent = "{\"leader\":\"00120nam 22000731a 4500\",\"fields\":[{\"001\":\"in001\"}," +
"{\"035\":{\"subfields\":[{\"a\":\"(ybp7406411)in001\"}," +
"{\"a\":\"(OCoLC)64758\"} ],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}";
var expectedSubfields = List.of("(ybp7406411)in001", "(OCoLC)64758");
var expectedSubfields = List.of("(OCoLC)64758");

ParsedRecord parsedRecord = new ParsedRecord().withContent(parsedContent);

Expand All @@ -483,7 +511,7 @@ public void shouldReturnSubfieldIfOclcExist() {
.withExternalIdsHolder(new ExternalIdsHolder().withInstanceId("001").withInstanceHrid("in001"));

// when
var subfields = get035SubfieldOclcValues(record, TAG_035, TAG_035_SUB).stream().map(Subfield::getData).toList();
var subfields = get035SubfieldOclcValues(record, TAG_035).stream().map(Subfield::getData).toList();
// then
Assert.assertEquals(expectedSubfields.size(), subfields.size());
Assert.assertEquals(expectedSubfields.get(0), subfields.get(0));
Expand Down Expand Up @@ -528,7 +556,7 @@ public void shouldPreserveOrderOf035FieldsAfterNormalization() {
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)00012345\"}],\"ind1\":\"\",\"ind2\":\"\"}}," +
"{\"040\":{\"subfields\":[{\"a\":\"DLC\"},{\"b\":\"eng\"},{\"c\":\"O\"},{\"d\":\"O\"},{\"d\":\"DLC\"}],\"ind1\":\"\",\"ind2\":\"\"}}]}";

var expectedParsedContent = "{\"leader\":\"00357cama 22001451a 4500\",\"fields\":[" +
var expectedParsedContent = "{\"leader\":\"00372cama 22001571a 4500\",\"fields\":[" +
"{\"001\":\"10065352\"}," +
"{\"005\":\"20220127143948.0\"}," +
"{\"008\":\"761216s1853mauch0010eng\"}," +
Expand All @@ -537,7 +565,8 @@ public void shouldPreserveOrderOf035FieldsAfterNormalization() {
"{\"010\":{\"subfields\":[{\"a\":\"01012052\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"022\":{\"subfields\":[{\"a\":\"0022-0469\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"030\":{\"subfields\":[{\"a\":\"0030-0469\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)2628488\"},{\"a\":\"(OCoLC)12345\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)2628488\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)12345\"}],\"ind1\":\" \",\"ind2\":\" \"}}," +
"{\"040\":{\"subfields\":[{\"a\":\"DLC\"},{\"b\":\"eng\"},{\"c\":\"O\"},{\"d\":\"O\"},{\"d\":\"DLC\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}";

ParsedRecord parsedRecord = new ParsedRecord().withContent(parsedContent);
Expand Down Expand Up @@ -567,7 +596,7 @@ public void shouldNotReturnSubfieldIfOclcNotExist() {
.withExternalIdsHolder(new ExternalIdsHolder().withInstanceId("001").withInstanceHrid("in001"));

// when
var subfields = get035SubfieldOclcValues(record, TAG_035, TAG_035_SUB).stream().map(Subfield::getData).toList();
var subfields = get035SubfieldOclcValues(record, TAG_035).stream().map(Subfield::getData).toList();
// then
Assert.assertEquals(0, subfields.size());
}
Expand Down
Loading