Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/librisworks finish #1343

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion librisworks/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ repositories {

dependencies {
implementation project(':whelktool')
implementation project(':whelk-core')
compileOnly "org.codehaus.groovy:groovy:${groovyVersion}"
compileOnly project(':whelk-core')
scriptsCompileOnly sourceSets.main.output
scriptsCompileOnly project(':whelk-core')
testImplementation "org.spockframework:spock-core:${spockVersion}"
Expand Down
25 changes: 15 additions & 10 deletions librisworks/run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/bin/bash
set -eu

# Find, match and merge work descriptions that describe the same work.
# Usage example: ./run.sh qa --num-threads 8

count_lines() {
if [ -f $1 ]; then
wc -l $1 | cut -d ' ' -f 1
Expand All @@ -9,7 +12,7 @@ count_lines() {
fi
}

if ! [[ "$1" =~ ^(local|dev|dev2|qa|stg|prod)$ ]]; then
if ! [[ "$1" =~ ^(local|dev|dev2|qa|stg|prod|edu)$ ]]; then
echo "Missing or invalid environment"
exit 1
fi
Expand All @@ -24,6 +27,7 @@ WHELKTOOL_REPORT=whelktool-report
CLUSTER_TSV=clusters.tsv

SCRIPTS_DIR=scripts
SVSK_DIR=$SCRIPTS_DIR/svsk
REPORT_DIR=reports/merge-works/$ENV-$(date +%Y%m%d)

CLUSTERS_DIR=$REPORT_DIR/clusters
Expand All @@ -47,15 +51,16 @@ ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/4-roles-to-instance
echo "Finding new clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
$ARGS --report $ALL/$WHELKTOOL_REPORT $SCRIPTS_DIR/find-work-clusters.groovy >$ALL/$CLUSTER_TSV 2>/dev/null

# Filter out duplicates
sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV

NUM_CLUSTERS=$(count_lines $ALL/$CLUSTER_TSV)
echo "$NUM_CLUSTERS clusters found"
if [ $NUM_CLUSTERS == 0 ]; then
exit 0
fi

# Filter out duplicates
sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV

echo
echo "Finding title clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$ALL/$CLUSTER_TSV -jar $JAR_FILE \
Expand All @@ -80,7 +85,7 @@ fi
echo
echo "Filtering on Swedish fiction..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SCRIPTS_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null
$ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SVSK_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $SWEDISH_FICTION/$CLUSTER_TSV)
echo "Found $NUM_CLUSTERS title clusters with Swedish fiction"
if [ $NUM_CLUSTERS == 0 ]; then
Expand All @@ -91,31 +96,31 @@ fi
echo
echo "Removing language from work titles..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $LANGUAGE_IN_TITLE $SCRIPTS_DIR/language-in-work-title.groovy 2>/dev/null
$ARGS --report $LANGUAGE_IN_TITLE $SVSK_DIR/language-in-work-title.groovy 2>/dev/null
echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in $LANGUAGE_IN_TITLE"

echo
echo "Merging contribution objects with same agent..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $DEDUPLICATE_CONTRIBUTIONS $SCRIPTS_DIR/lxl-4150-deduplicate-contribution.groovy 2>/dev/null
$ARGS --report $DEDUPLICATE_CONTRIBUTIONS $SVSK_DIR/lxl-4150-deduplicate-contribution.groovy 2>/dev/null
echo "$(count_lines $DEDUPLICATE_CONTRIBUTIONS/MODIFIED.txt) records affected, report in $DEDUPLICATE_CONTRIBUTIONS"

echo
echo "Adding missing contribution data..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $ADD_MISSING_CONTRIBUTION_DATA $SCRIPTS_DIR/add-missing-contribution-data.groovy 2>/dev/null
$ARGS --report $ADD_MISSING_CONTRIBUTION_DATA $SVSK_DIR/add-missing-contribution-data.groovy 2>/dev/null
echo "$(count_lines $ADD_MISSING_CONTRIBUTION_DATA/MODIFIED.txt) records affected, report in $ADD_MISSING_CONTRIBUTION_DATA"

echo
echo "Moving roles to instance..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $ROLES_TO_INSTANCE $SCRIPTS_DIR/contributions-to-instance.groovy 2>/dev/null
$ARGS --report $ROLES_TO_INSTANCE $SVSK_DIR/contributions-to-instance.groovy 2>/dev/null
echo "$(count_lines $ROLES_TO_INSTANCE/MODIFIED.txt) records affected, report in $ROLES_TO_INSTANCE"

# Filter: Drop anonymous translations
echo "Filtering out anonymous translations..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $NO_ANONYMOUS_TRANSLATIONS/$WHELKTOOL_REPORT $SCRIPTS_DIR/drop-anonymous-translations.groovy \
$ARGS --report $NO_ANONYMOUS_TRANSLATIONS/$WHELKTOOL_REPORT $SVSK_DIR/drop-anonymous-translations.groovy \
>$NO_ANONYMOUS_TRANSLATIONS/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $NO_ANONYMOUS_TRANSLATIONS/$CLUSTER_TSV)
echo "$NUM_CLUSTERS clusters ready for merge"
Expand Down
36 changes: 27 additions & 9 deletions librisworks/scripts/find-work-clusters.groovy
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
/**
* Find clusters of records that may contain descriptions of the same work.
* In short, similar descriptions are found by, for each bib record, querying Elastic for other records
* having the same instance or work title and the same agent(s) in work contribution.
* The ids found by the query become a cluster.
* See script for more details.
*
* (When running, redirect STDERR to avoid annoying prints from whelktool)
*/

import static se.kb.libris.mergeworks.Util.AGENT
import static se.kb.libris.mergeworks.Util.HAS_TITLE
import static se.kb.libris.mergeworks.Util.MAIN_TITLE
import static se.kb.libris.mergeworks.Util.PRIMARY
import static se.kb.libris.mergeworks.Util.CONTRIBUTION
import static whelk.JsonLd.ID_KEY
import static whelk.JsonLd.TYPE_KEY

PrintWriter failedQueries = getReportWriter("failed-queries")
PrintWriter tooLargeResult = getReportWriter("too-large-result")

Expand All @@ -12,6 +26,7 @@ def process = { bib ->

if (!work) return

// Get mainTitle from both instance and work (we want to search for both when they differ)
def titles = [instance, work].grep().collect { title(it) }.grep().unique()

Set ids = []
Expand All @@ -27,6 +42,7 @@ def process = { bib ->
if (ids.size() > 1000) {
tooLargeResult.println("Results: ${ids.size()} Id: ${bib.doc.shortId} Titles: ${titles}")
} else if (ids.size() > 1) {
// Sort so that duplicate clusters can easily be identified
println(ids.sort().join('\t'))
}
}
Expand All @@ -51,13 +67,15 @@ Map<String, List<String>> buildQuery(Map work, String title) {
insertLinkedAgents(work)
def card = getWhelk().jsonld.toCard(work, false, true)

// If there is a primary contributor, include only that agent in the query
def author = primaryContributor(card).collect { esSafe(it) }
if (author) {
query["or-instanceOf.contribution._str"] = author
query["or-instanceOf.contribution.agent._str"] = author
return query
}

// If no primary contributor, include all agents in the query
def allContributors = contributors(card).collect { esSafe(it) }
if (allContributors) {
query["or-instanceOf.contribution._str"] = allContributors
Expand All @@ -69,29 +87,29 @@ Map<String, List<String>> buildQuery(Map work, String title) {
}

private void insertLinkedAgents(work) {
asList(work['contribution']).each {
def agent = asList(it.agent).find()
if (agent && agent['@id']) {
it.agent = loadThing(agent['@id'])
asList(work[CONTRIBUTION]).each {
def agent = asList(it[AGENT]).find()
if (agent && agent[ID_KEY]) {
it.agent = loadThing(agent[ID_KEY])
}
}
}

private String title(Map thing) {
return getAtPath(thing, ['hasTitle', 0, 'mainTitle'])
return getAtPath(thing, [HAS_TITLE, 0, MAIN_TITLE])
}

private List primaryContributor(work) {
contributorStrings(asList(work['contribution']).find { it['@type'] == "PrimaryContribution" })
contributorStrings(asList(work[CONTRIBUTION]).find { it[TYPE_KEY] == PRIMARY })
}

private List contributors(work) {
asList(work['contribution']).collect { contributorStrings(it) }.grep().flatten()
asList(work[CONTRIBUTION]).collect { contributorStrings(it) }.grep().flatten()
}

//getAtPath(contribution, ['_str'])?.with { String s -> s.replaceAll(/[^ \p{IsAlphabetic}]/, '') }
private List contributorStrings(contribution) {
List variants = asList(contribution?.agent) + asList(getAtPath(contribution, ['agent', 'hasVariant']))
List variants = asList(contribution?[AGENT]) + asList(getAtPath(contribution, [AGENT, 'hasVariant']))

variants.grep().collect { name(it) }.grep()
}
Expand All @@ -108,7 +126,7 @@ private String esSafe(String s) {
}

private loadIfLink(Map work) {
work?['@id'] ? loadThing(work['@id']) : work
work?[ID_KEY] ? loadThing(work[ID_KEY]) : work
}

private Map loadThing(def id) {
Expand Down
32 changes: 0 additions & 32 deletions librisworks/scripts/lxl-4150-deduplicate-contribution.groovy

This file was deleted.

41 changes: 31 additions & 10 deletions librisworks/scripts/merge-works.groovy
Original file line number Diff line number Diff line change
@@ -1,8 +1,27 @@
/**
* Match and merge works.
*
* First create clusters of works that are considered equal according to given criteria.
* If a work cluster contains only local works (two or more), merge those and create a new linkable work.
* If a work cluster contains exactly one linked work and at least one local work, merge the local work(s) into the linked one.
* If a work cluster contains two or more linked works, report. There should be no duplicate linked works.
*
* If multiple work clusters are found, add closeMatch links from each unique work to each resulting linked work.
*
* See script for details.
*/

import se.kb.libris.mergeworks.Html
import se.kb.libris.mergeworks.WorkComparator
import se.kb.libris.mergeworks.Doc

import static whelk.JsonLd.GRAPH_KEY
import static whelk.JsonLd.ID_KEY

import static se.kb.libris.mergeworks.Util.workClusters
import static whelk.JsonLd.THING_KEY
import static whelk.JsonLd.TYPE_KEY
import static whelk.JsonLd.WORK_KEY

maybeDuplicates = getReportWriter("maybe-duplicate-linked-works.tsv")
multiWorkReport = getReportWriter("multi-work-clusters.html")
Expand Down Expand Up @@ -34,6 +53,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
List<Tuple2<Doc, Collection<Doc>>> uniqueWorksAndTheirInstances = []

workClusters(docs, c).each { wc ->
// Only local works have instance data in the same record
def (localWorks, linkedWorks) = wc.split { it.instanceData }
if (linkedWorks.isEmpty()) {
if (localWorks.size() == 1) {
Expand Down Expand Up @@ -64,7 +84,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
}
// New merged work
if (!workDoc.existsInStorage && !workDoc.instanceData) {
addAdminMetadata(workDoc, instanceDocs.collect { ['@id': it.recordIri()] })
addAdminMetadata(workDoc, instanceDocs.collect { [(ID_KEY): it.recordIri()] })
addCloseMatch(workDoc, linkableWorks)
saveAndLink(workDoc, instanceDocs, workDoc.existsInStorage)
// writeWorkReport(docs, workDoc, instanceDocs, WorkStatus.NEW)
Expand All @@ -76,6 +96,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
}
}

// Multiple unique works found in the same title cluster; save a report showing how they differ.
if (uniqueWorksAndTheirInstances.size() > 1) {
def (workDocs, instanceDocs) = uniqueWorksAndTheirInstances.transpose()
multiWorkReport.print(Html.hubTable(workDocs, instanceDocs) + Html.HORIZONTAL_RULE)
Expand All @@ -101,20 +122,20 @@ void saveAndLink(Doc workDoc, Collection<Doc> instanceDocs = [], boolean existsI

if (!instanceDocs.isEmpty()) {
selectByIds(instanceDocs.collect { it.shortId() }) {
it.graph[1]['instanceOf'] = ['@id': workDoc.thingIri()]
it.graph[1][WORK_KEY] = [(ID_KEY): workDoc.thingIri()]
it.scheduleSave(changedBy: changedBy, generationProcess: generationProcess)
}
}
}

Doc createNewWork(Map workData) {
workData['@id'] = "TEMPID#it"
workData[ID_KEY] = "TEMPID#it"
Map data = [
"@graph": [
(GRAPH_KEY): [
[
"@id" : "TEMPID",
"@type" : "Record",
"mainEntity": ["@id": "TEMPID#it"],
(ID_KEY) : "TEMPID",
(TYPE_KEY) : "Record",
(THING_KEY): [(ID_KEY): "TEMPID#it"],

],
workData
Expand All @@ -127,12 +148,12 @@ Doc createNewWork(Map workData) {
void addAdminMetadata(Doc doc, List<Map> derivedFrom) {
doc.record()['hasChangeNote'] = [
[
'@type': 'CreateNote',
(ID_KEY): 'CreateNote',
'tool' : ['@id': 'https://id.kb.se/generator/mergeworks']
]
]
doc.record()['derivedFrom'] = derivedFrom
doc.record()['descriptionLanguage'] = ['@id': 'https://id.kb.se/language/swe']
doc.record()['descriptionLanguage'] = [(ID_KEY): 'https://id.kb.se/language/swe']
}

void writeWorkReport(Collection<Doc> titleCluster, Doc derivedWork, Collection<Doc> derivedFrom, WorkStatus workStatus) {
Expand Down Expand Up @@ -175,7 +196,7 @@ boolean addCloseMatch(Doc workDoc, List<Doc> linkableWorks) {
def linkTo = linkableWorks.findAll { d ->
d.workIri() != workDoc.thingIri()
&& d.primaryContributor() == workDoc.primaryContributor()
}.collect { ['@id': it.workIri()] }
}.collect { [(ID_KEY): it.workIri()] }

def closeMatch = asList(workDoc.workData['closeMatch'])

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
/**
* Use various methods for completing and normalizing contributions within a work cluster.
* See individual methods for details.
*/

import groovy.transform.Memoized
import org.apache.commons.lang3.StringUtils

Expand Down Expand Up @@ -132,6 +137,7 @@ selectByIds(clusters.flatten()) { bib ->
modified |= tryLinkAgent(c, id)
// if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those
modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id)
// if two local agents match on name and one of them has lifeSpan and the other doesn't, add that lifeSpan to the one missing it.
modified |= tryAddLifeSpanToLocalAgent(c, id)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
/**
* Move contribution to instance if the role's domain is (or is subclass of) Embodiment.
 * Also move illustrator to instance if none of the following criteria are met:
* - The illustrator is the primary contributor (PrimaryContribution)
* - Classification indicates a picture book or comics
* - Genre/form indicates a picture book or comics
* See isComics() and isPictureBook() below for details.
*/

import whelk.Whelk

import java.util.concurrent.ConcurrentHashMap
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
/**
 * Drop from clusters those works that are translations but lack a translator in their contribution.
*/

import se.kb.libris.mergeworks.Doc

new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
/**
* Remove language appearing as a substring of work main title if already described in the language property.
* E.g. "Pippi Långstrump (Svenska)" --> "Pippi Långstrump" when work.language = [{'@id': 'https://id.kb.se/language/swe'}]
*/

import groovy.transform.Memoized
import whelk.util.DocumentUtil

Expand Down
Loading
Loading