diff --git a/src/main/java/org/folio/inventory/dataimport/util/AdditionalFieldsUtil.java b/src/main/java/org/folio/inventory/dataimport/util/AdditionalFieldsUtil.java index f3b6faf9b..eb3622362 100644 --- a/src/main/java/org/folio/inventory/dataimport/util/AdditionalFieldsUtil.java +++ b/src/main/java/org/folio/inventory/dataimport/util/AdditionalFieldsUtil.java @@ -26,15 +26,14 @@ import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.Collections; -import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Optional; -import java.util.Set; import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; + import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.tuple.Pair; @@ -80,6 +79,7 @@ public final class AdditionalFieldsUtil { private static final CacheLoader parsedRecordContentCacheLoader; private static final LoadingCache parsedRecordContentCache; private static final String OCLC = "OCoLC"; + private static final String OCLC_PREFIX = "(OCoLC)"; private static final ObjectMapper objectMapper = new ObjectMapper(); public static final String FIELDS = "fields"; private static final String OCLC_PATTERN = "\\((" + OCLC + ")\\)((ocm|ocn|on)?0*|([a-zA-Z]+)0*)(\\d+\\w*)"; @@ -272,24 +272,21 @@ public static void move001To035(Record srcRecord) { removeField(srcRecord, TAG_003); } - public static void normalize035(Record srsRecord) { - List subfields = get035SubfieldOclcValues(srsRecord, TAG_035, TAG_035_SUB); + public static void normalize035(Record srcRecord) { + List subfields = get035SubfieldOclcValues(srcRecord, TAG_035); if (!subfields.isEmpty()) { - Set normalized035Subfields = formatOclc(subfields); - - updateOclcSubfield(srsRecord, normalized035Subfields); + formatOclc(subfields); + deduplicateOclc(srcRecord, subfields, TAG_035); + recalculateLeaderAndParsedRecord(srcRecord); } } - private static Set formatOclc(List subFields) { - Set processedSet = new LinkedHashSet<>(); + private static void formatOclc(List subfields) { Pattern pattern = Pattern.compile(OCLC_PATTERN); - for (Subfield subfield : subFields) { + for (Subfield subfield : subfields) { String data = subfield.getData().replaceAll("[.\\s]", ""); - var code = subfield.getCode(); Matcher matcher = pattern.matcher(data); - if (matcher.find()) { String oclcTag = matcher.group(1); // "OCoLC" String numericAndTrailing = matcher.group(5); // Numeric part and any characters that follow @@ -297,7 +294,7 @@ private static Set formatOclc(List subFields) { if (prefix != null && (prefix.startsWith("ocm") || prefix.startsWith("ocn") || prefix.startsWith("on"))) { // If "ocm" or "ocn", strip entirely from the prefix - processedSet.add(code + "&(" + oclcTag + ")" + numericAndTrailing); + subfield.setData("(" + oclcTag + ")" + numericAndTrailing); } else { // For other cases, strip leading zeros only from the numeric part numericAndTrailing = numericAndTrailing.replaceFirst("^0+", ""); @@ -305,69 +302,77 @@ private static Set formatOclc(List subFields) { prefix = prefix.replaceAll("\\d+", ""); // Safely remove digits from the prefix if not null } // Add back any other prefix that might have been included like "tfe" - processedSet.add(code + "&(" + oclcTag + ")" + (prefix != null ? prefix : "") + numericAndTrailing); + subfield.setData("(" + oclcTag + ")" + (prefix != null ? prefix : "") + numericAndTrailing); } - } else { - // If it does not match, add the data as is - processedSet.add(code + "&" + data); } } - return processedSet; } - private static void updateOclcSubfield(Record recordForUpdate, - Set normalizedValues) { - try (ByteArrayOutputStream os = new ByteArrayOutputStream()) { - if (recordForUpdate != null && recordForUpdate.getParsedRecord() != null && recordForUpdate.getParsedRecord().getContent() != null) { - MarcWriter streamWriter = new MarcStreamWriter(new ByteArrayOutputStream()); - MarcJsonWriter jsonWriter = new MarcJsonWriter(os); - MarcFactory factory = MarcFactory.newInstance(); - org.marc4j.marc.Record marcRecord = computeMarcRecord(recordForUpdate); - if (marcRecord != null) { + private static void deduplicateOclc(Record srcRecord, List subfields, String tag) { + List subfieldsToDelete = new ArrayList<>(); - DataField dataField = factory.newDataField(TAG_035, TAG_035_IND, TAG_035_IND); - normalizedValues.forEach(value -> { - var v = value.split("&"); - dataField.addSubfield(factory.newSubfield(v[0].charAt(0), v[1])); - }); + for (Subfield subfield: new ArrayList<>(subfields)) { + if (subfields.stream().anyMatch(s -> isOclcSubfieldDuplicated(subfield, s))) { + subfieldsToDelete.add(subfield); + subfields.remove(subfield); + } + } + Optional.ofNullable(computeMarcRecord(srcRecord)).ifPresent(marcRecord -> { + List variableFields = marcRecord.getVariableFields(tag); - replaceOclc035FieldWithNormalizedData(marcRecord, dataField); + subfieldsToDelete.forEach(subfieldToDelete -> + variableFields.forEach(field -> removeSubfieldIfExist(marcRecord, field, subfieldToDelete))); + }); + } - // use stream writer to recalculate leader - streamWriter.write(marcRecord); - jsonWriter.write(marcRecord); + private static boolean isOclcSubfieldDuplicated(Subfield s1, Subfield s2) { + return s1 != s2 && s1.getData().equals(s2.getData()) && s1.getCode() == s2.getCode(); + } - String parsedContentString = new JsonObject(os.toString()).encode(); - // save parsed content string to cache then set it on the record - parsedRecordContentCache.put(parsedContentString, marcRecord); - recordForUpdate.setParsedRecord(recordForUpdate.getParsedRecord().withContent(parsedContentString)); - } + private static void removeSubfieldIfExist(org.marc4j.marc.Record marcRecord, VariableField field, Subfield subfieldToDelete) { + if (field instanceof DataField dataField && dataField.getSubfields().contains(subfieldToDelete)) { + if (dataField.getSubfields().size() > 1) { + dataField.removeSubfield(subfieldToDelete); + } else { + marcRecord.removeVariableField(dataField); + } + } + } + + private static void recalculateLeaderAndParsedRecord(Record recordForUpdate) { + try (ByteArrayOutputStream os = new ByteArrayOutputStream()) { + MarcWriter streamWriter = new MarcStreamWriter(new ByteArrayOutputStream()); + MarcJsonWriter jsonWriter = new MarcJsonWriter(os); + org.marc4j.marc.Record marcRecord = computeMarcRecord(recordForUpdate); + + if (marcRecord != null) { + // use stream writer to recalculate leader + streamWriter.write(marcRecord); + jsonWriter.write(marcRecord); + + String parsedContentString = new JsonObject(os.toString()).encode(); + // save parsed content string to cache then set it on the record + parsedRecordContentCache.put(parsedContentString, marcRecord); + recordForUpdate.setParsedRecord(recordForUpdate.getParsedRecord().withContent(parsedContentString)); } } catch (Exception e) { - LOGGER.warn("Failed to update OCLC subfield for record: {}", recordForUpdate.getId(), e); + LOGGER.warn("recalculateLeaderAndParsedRecord:: Failed to recalculate leader and parsed record for record: {}", recordForUpdate.getId(), e); } } - public static List get035SubfieldOclcValues(Record srcRecord, String tag, char subfield) { + public static List get035SubfieldOclcValues(Record srcRecord, String tag) { return Optional.ofNullable(computeMarcRecord(srcRecord)) .stream() .flatMap(marcRecord -> marcRecord.getVariableFields(tag).stream()) - .flatMap(field -> get035oclcSubfields(field, subfield).stream()) - .toList(); + .flatMap(field -> get035oclcSubfields(field).stream()) + .collect(Collectors.toList()); } - private static List get035oclcSubfields(VariableField field, char subfield) { + private static List get035oclcSubfields(VariableField field) { if (field instanceof DataField dataField) { - - Optional oclcSubfield = dataField.getSubfields(subfield).stream() - .filter(sf -> sf.find(OCLC)) - .findFirst(); - - if (oclcSubfield.isPresent()) { - return dataField.getSubfields(); - } else { - return Collections.emptyList(); - } + return dataField.getSubfields().stream() + .filter(sf -> sf.getData().startsWith(OCLC_PREFIX)) + .toList(); } return Collections.emptyList(); } @@ -579,21 +584,6 @@ private static boolean removeFirstFoundFieldByName(org.marc4j.marc.Record marcRe return isFieldFound; } - private static void replaceOclc035FieldWithNormalizedData(org.marc4j.marc.Record marcRecord, - DataField dataField) { - var variableFields = marcRecord.getVariableFields(TAG_035); - int[] index = {0}; - if (!variableFields.isEmpty()) { - variableFields.stream() - .filter(variableField -> variableField.find(OCLC)) - .forEach(variableField -> { - index[0] = (marcRecord.getDataFields().indexOf(variableField)); - marcRecord.removeVariableField(variableField); - }); - marcRecord.getDataFields().add(index[0], dataField); - } - } - /** * Checks if the field contains a certain value in the selected subfield * diff --git a/src/test/java/org/folio/inventory/dataimport/util/AdditionalFieldsUtilTest.java b/src/test/java/org/folio/inventory/dataimport/util/AdditionalFieldsUtilTest.java index a16a264dd..72376fc48 100644 --- a/src/test/java/org/folio/inventory/dataimport/util/AdditionalFieldsUtilTest.java +++ b/src/test/java/org/folio/inventory/dataimport/util/AdditionalFieldsUtilTest.java @@ -3,7 +3,6 @@ import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_001; import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_005; import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_035; -import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_035_SUB; import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.TAG_999; import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.addControlledFieldToMarcRecord; import static org.folio.inventory.dataimport.util.AdditionalFieldsUtil.addDataFieldToMarcRecord; @@ -445,6 +444,35 @@ public static Collection data() { "{\"leader\":\"00098nam 22000611a 4500\",\"fields\":[{\"001\":\"in001\"}," + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)607TST001\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + "{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}" + }, + { + "{\"leader\":\"00120nam 22000731a 4500\",\"fields\":[{\"001\":\"in001\"}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC-M)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(ABC)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)ocn0001234\"}, {\"a\":\"(OCoLC)ocn1234\"}, {\"b\":\"(OCoLC)ocn1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)ocm1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)ocn00098765\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}", + + "{\"leader\":\"00218nam 22001091a 4500\",\"fields\":[{\"001\":\"in001\"}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC-M)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(ABC)ocn0001234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"b\":\"(OCoLC)1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)98765\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}" + }, + { + "{\"leader\":\"00126nam 22000731a 4500\",\"fields\":[{\"001\":\"in001\"}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC-M)1234456\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}", + + "{\"leader\":\"00126nam 22000731a 4500\",\"fields\":[{\"001\":\"in001\"}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)1234\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC-M)1234456\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}" } }); } @@ -466,13 +494,13 @@ public void shouldNormalizeOCoLCField035() { } @Test - public void shouldReturnSubfieldIfOclcExist() { + public void shouldReturnOnlyOclcSubfield() { // given String parsedContent = "{\"leader\":\"00120nam 22000731a 4500\",\"fields\":[{\"001\":\"in001\"}," + "{\"035\":{\"subfields\":[{\"a\":\"(ybp7406411)in001\"}," + "{\"a\":\"(OCoLC)64758\"} ],\"ind1\":\" \",\"ind2\":\" \"}}," + "{\"500\":{\"subfields\":[{\"a\":\"data\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}"; - var expectedSubfields = List.of("(ybp7406411)in001", "(OCoLC)64758"); + var expectedSubfields = List.of("(OCoLC)64758"); ParsedRecord parsedRecord = new ParsedRecord().withContent(parsedContent); @@ -483,7 +511,7 @@ public void shouldReturnSubfieldIfOclcExist() { .withExternalIdsHolder(new ExternalIdsHolder().withInstanceId("001").withInstanceHrid("in001")); // when - var subfields = get035SubfieldOclcValues(record, TAG_035, TAG_035_SUB).stream().map(Subfield::getData).toList(); + var subfields = get035SubfieldOclcValues(record, TAG_035).stream().map(Subfield::getData).toList(); // then Assert.assertEquals(expectedSubfields.size(), subfields.size()); Assert.assertEquals(expectedSubfields.get(0), subfields.get(0)); @@ -528,7 +556,7 @@ public void shouldPreserveOrderOf035FieldsAfterNormalization() { "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)00012345\"}],\"ind1\":\"\",\"ind2\":\"\"}}," + "{\"040\":{\"subfields\":[{\"a\":\"DLC\"},{\"b\":\"eng\"},{\"c\":\"O\"},{\"d\":\"O\"},{\"d\":\"DLC\"}],\"ind1\":\"\",\"ind2\":\"\"}}]}"; - var expectedParsedContent = "{\"leader\":\"00357cama 22001451a 4500\",\"fields\":[" + + var expectedParsedContent = "{\"leader\":\"00372cama 22001571a 4500\",\"fields\":[" + "{\"001\":\"10065352\"}," + "{\"005\":\"20220127143948.0\"}," + "{\"008\":\"761216s1853mauch0010eng\"}," + @@ -537,7 +565,8 @@ public void shouldPreserveOrderOf035FieldsAfterNormalization() { "{\"010\":{\"subfields\":[{\"a\":\"01012052\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + "{\"022\":{\"subfields\":[{\"a\":\"0022-0469\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + "{\"030\":{\"subfields\":[{\"a\":\"0030-0469\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + - "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)2628488\"},{\"a\":\"(OCoLC)12345\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)2628488\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + + "{\"035\":{\"subfields\":[{\"a\":\"(OCoLC)12345\"}],\"ind1\":\" \",\"ind2\":\" \"}}," + "{\"040\":{\"subfields\":[{\"a\":\"DLC\"},{\"b\":\"eng\"},{\"c\":\"O\"},{\"d\":\"O\"},{\"d\":\"DLC\"}],\"ind1\":\" \",\"ind2\":\" \"}}]}"; ParsedRecord parsedRecord = new ParsedRecord().withContent(parsedContent); @@ -567,7 +596,7 @@ public void shouldNotReturnSubfieldIfOclcNotExist() { .withExternalIdsHolder(new ExternalIdsHolder().withInstanceId("001").withInstanceHrid("in001")); // when - var subfields = get035SubfieldOclcValues(record, TAG_035, TAG_035_SUB).stream().map(Subfield::getData).toList(); + var subfields = get035SubfieldOclcValues(record, TAG_035).stream().map(Subfield::getData).toList(); // then Assert.assertEquals(0, subfields.size()); }