Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/librisworks finish #1343

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion librisworks/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ repositories {

dependencies {
implementation project(':whelktool')
implementation project(':whelk-core')
compileOnly "org.codehaus.groovy:groovy:${groovyVersion}"
compileOnly project(':whelk-core')
scriptsCompileOnly sourceSets.main.output
scriptsCompileOnly project(':whelk-core')
testImplementation "org.spockframework:spock-core:${spockVersion}"
Expand Down
25 changes: 15 additions & 10 deletions librisworks/run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/bin/bash
set -eu

# Find, match and merge work descriptions that describe the same work.
# Usage example: ./run.sh qa --num-threads 8

count_lines() {
if [ -f $1 ]; then
wc -l $1 | cut -d ' ' -f 1
Expand All @@ -9,7 +12,7 @@ count_lines() {
fi
}

if ! [[ "$1" =~ ^(local|dev|dev2|qa|stg|prod)$ ]]; then
if ! [[ "$1" =~ ^(local|dev|dev2|qa|stg|prod|edu)$ ]]; then
echo "Missing or invalid environment"
exit 1
fi
Expand All @@ -24,6 +27,7 @@ WHELKTOOL_REPORT=whelktool-report
CLUSTER_TSV=clusters.tsv

SCRIPTS_DIR=scripts
SVSK_DIR=$SCRIPTS_DIR/svsk
REPORT_DIR=reports/merge-works/$ENV-$(date +%Y%m%d)

CLUSTERS_DIR=$REPORT_DIR/clusters
Expand All @@ -47,15 +51,16 @@ ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/4-roles-to-instance
echo "Finding new clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
$ARGS --report $ALL/$WHELKTOOL_REPORT $SCRIPTS_DIR/find-work-clusters.groovy >$ALL/$CLUSTER_TSV 2>/dev/null

# Filter out duplicates
sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV

NUM_CLUSTERS=$(count_lines $ALL/$CLUSTER_TSV)
echo "$NUM_CLUSTERS clusters found"
if [ $NUM_CLUSTERS == 0 ]; then
exit 0
fi

# Filter out duplicates
sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV

echo
echo "Finding title clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$ALL/$CLUSTER_TSV -jar $JAR_FILE \
Expand All @@ -80,7 +85,7 @@ fi
echo
echo "Filtering on Swedish fiction..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SCRIPTS_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null
$ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SVSK_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $SWEDISH_FICTION/$CLUSTER_TSV)
echo "Found $NUM_CLUSTERS title clusters with Swedish fiction"
if [ $NUM_CLUSTERS == 0 ]; then
Expand All @@ -91,31 +96,31 @@ fi
echo
echo "Removing language from work titles..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $LANGUAGE_IN_TITLE $SCRIPTS_DIR/language-in-work-title.groovy 2>/dev/null
$ARGS --report $LANGUAGE_IN_TITLE $SVSK_DIR/language-in-work-title.groovy 2>/dev/null
echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in $LANGUAGE_IN_TITLE"

echo
echo "Merging contribution objects with same agent..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $DEDUPLICATE_CONTRIBUTIONS $SCRIPTS_DIR/lxl-4150-deduplicate-contribution.groovy 2>/dev/null
$ARGS --report $DEDUPLICATE_CONTRIBUTIONS $SVSK_DIR/lxl-4150-deduplicate-contribution.groovy 2>/dev/null
echo "$(count_lines $DEDUPLICATE_CONTRIBUTIONS/MODIFIED.txt) records affected, report in $DEDUPLICATE_CONTRIBUTIONS"

echo
echo "Adding missing contribution data..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $ADD_MISSING_CONTRIBUTION_DATA $SCRIPTS_DIR/add-missing-contribution-data.groovy 2>/dev/null
$ARGS --report $ADD_MISSING_CONTRIBUTION_DATA $SVSK_DIR/add-missing-contribution-data.groovy 2>/dev/null
echo "$(count_lines $ADD_MISSING_CONTRIBUTION_DATA/MODIFIED.txt) records affected, report in $ADD_MISSING_CONTRIBUTION_DATA"

echo
echo "Moving roles to instance..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $ROLES_TO_INSTANCE $SCRIPTS_DIR/contributions-to-instance.groovy 2>/dev/null
$ARGS --report $ROLES_TO_INSTANCE $SVSK_DIR/contributions-to-instance.groovy 2>/dev/null
echo "$(count_lines $ROLES_TO_INSTANCE/MODIFIED.txt) records affected, report in $ROLES_TO_INSTANCE"

# Filter: Drop anonymous translations
echo "Filtering out anonymous translations..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $NO_ANONYMOUS_TRANSLATIONS/$WHELKTOOL_REPORT $SCRIPTS_DIR/drop-anonymous-translations.groovy \
$ARGS --report $NO_ANONYMOUS_TRANSLATIONS/$WHELKTOOL_REPORT $SVSK_DIR/drop-anonymous-translations.groovy \
>$NO_ANONYMOUS_TRANSLATIONS/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $NO_ANONYMOUS_TRANSLATIONS/$CLUSTER_TSV)
echo "$NUM_CLUSTERS clusters ready for merge"
Expand Down
36 changes: 27 additions & 9 deletions librisworks/scripts/find-work-clusters.groovy
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
/**
* Find clusters of records that may contain descriptions of the same work.
* In short, similar descriptions are found by, for each bib record, querying Elastic for other records
* having the same instance or work title and the same agent(s) in work contribution.
* The ids found by the query become a cluster.
* See script for more details.
*
* (When running, redirect STDERR to avoid annoying prints from whelktool)
*/

import static se.kb.libris.mergeworks.Util.AGENT
import static se.kb.libris.mergeworks.Util.HAS_TITLE
import static se.kb.libris.mergeworks.Util.MAIN_TITLE
import static se.kb.libris.mergeworks.Util.PRIMARY
import static se.kb.libris.mergeworks.Util.CONTRIBUTION
import static whelk.JsonLd.ID_KEY
import static whelk.JsonLd.TYPE_KEY

PrintWriter failedQueries = getReportWriter("failed-queries")
PrintWriter tooLargeResult = getReportWriter("too-large-result")

Expand All @@ -12,6 +26,7 @@ def process = { bib ->

if (!work) return

// Get mainTitle from both instance and work (we want to search for both when they differ)
def titles = [instance, work].grep().collect { title(it) }.grep().unique()

Set ids = []
Expand All @@ -27,6 +42,7 @@ def process = { bib ->
if (ids.size() > 1000) {
tooLargeResult.println("Results: ${ids.size()} Id: ${bib.doc.shortId} Titles: ${titles}")
} else if (ids.size() > 1) {
// Sort so that duplicate clusters can easily be identified
println(ids.sort().join('\t'))
}
}
Expand All @@ -51,13 +67,15 @@ Map<String, List<String>> buildQuery(Map work, String title) {
insertLinkedAgents(work)
def card = getWhelk().jsonld.toCard(work, false, true)

// If there is a primary contributor, include only that agent in the query
def author = primaryContributor(card).collect { esSafe(it) }
if (author) {
query["or-instanceOf.contribution._str"] = author
query["or-instanceOf.contribution.agent._str"] = author
return query
}

// If no primary contributor, include all agents in the query
def allContributors = contributors(card).collect { esSafe(it) }
if (allContributors) {
query["or-instanceOf.contribution._str"] = allContributors
Expand All @@ -69,29 +87,29 @@ Map<String, List<String>> buildQuery(Map work, String title) {
}

private void insertLinkedAgents(work) {
asList(work['contribution']).each {
def agent = asList(it.agent).find()
if (agent && agent['@id']) {
it.agent = loadThing(agent['@id'])
asList(work[CONTRIBUTION]).each {
def agent = asList(it[AGENT]).find()
if (agent && agent[ID_KEY]) {
it.agent = loadThing(agent[ID_KEY])
}
}
}

private String title(Map thing) {
return getAtPath(thing, ['hasTitle', 0, 'mainTitle'])
return getAtPath(thing, [HAS_TITLE, 0, MAIN_TITLE])
}

private List primaryContributor(work) {
contributorStrings(asList(work['contribution']).find { it['@type'] == "PrimaryContribution" })
contributorStrings(asList(work[CONTRIBUTION]).find { it[TYPE_KEY] == PRIMARY })
}

private List contributors(work) {
asList(work['contribution']).collect { contributorStrings(it) }.grep().flatten()
asList(work[CONTRIBUTION]).collect { contributorStrings(it) }.grep().flatten()
}

//getAtPath(contribution, ['_str'])?.with { String s -> s.replaceAll(/[^ \p{IsAlphabetic}]/, '') }
private List contributorStrings(contribution) {
List variants = asList(contribution?.agent) + asList(getAtPath(contribution, ['agent', 'hasVariant']))
List variants = asList(contribution?[AGENT]) + asList(getAtPath(contribution, [AGENT, 'hasVariant']))

variants.grep().collect { name(it) }.grep()
}
Expand All @@ -108,7 +126,7 @@ private String esSafe(String s) {
}

private loadIfLink(Map work) {
work?['@id'] ? loadThing(work['@id']) : work
work?[ID_KEY] ? loadThing(work[ID_KEY]) : work
}

private Map loadThing(def id) {
Expand Down
32 changes: 0 additions & 32 deletions librisworks/scripts/lxl-4150-deduplicate-contribution.groovy

This file was deleted.

41 changes: 31 additions & 10 deletions librisworks/scripts/merge-works.groovy
Original file line number Diff line number Diff line change
@@ -1,8 +1,27 @@
/**
* Match and merge works.
*
* First create clusters of works that are considered equal according to given criteria.
* If a work cluster contains only local works (two or more), merge those and create a new linkable work.
* If a work cluster contains exactly one linked work and at least one local work, merge the local work(s) into the linked one.
* If a work cluster contains two or more linked works, report. There should be no duplicate linked works.
*
* If multiple work clusters are found, add closeMatch links from each unique work to each resulting linked work.
*
* See script for details.
*/

import se.kb.libris.mergeworks.Html
import se.kb.libris.mergeworks.WorkComparator
import se.kb.libris.mergeworks.Doc

import static whelk.JsonLd.GRAPH_KEY
import static whelk.JsonLd.ID_KEY

import static se.kb.libris.mergeworks.Util.workClusters
import static whelk.JsonLd.THING_KEY
import static whelk.JsonLd.TYPE_KEY
import static whelk.JsonLd.WORK_KEY

maybeDuplicates = getReportWriter("maybe-duplicate-linked-works.tsv")
multiWorkReport = getReportWriter("multi-work-clusters.html")
Expand Down Expand Up @@ -34,6 +53,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
List<Tuple2<Doc, Collection<Doc>>> uniqueWorksAndTheirInstances = []

workClusters(docs, c).each { wc ->
// Only local works have instance data in the same record
def (localWorks, linkedWorks) = wc.split { it.instanceData }
if (linkedWorks.isEmpty()) {
if (localWorks.size() == 1) {
Expand Down Expand Up @@ -64,7 +84,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
}
// New merged work
if (!workDoc.existsInStorage && !workDoc.instanceData) {
addAdminMetadata(workDoc, instanceDocs.collect { ['@id': it.recordIri()] })
addAdminMetadata(workDoc, instanceDocs.collect { [(ID_KEY): it.recordIri()] })
addCloseMatch(workDoc, linkableWorks)
saveAndLink(workDoc, instanceDocs, workDoc.existsInStorage)
// writeWorkReport(docs, workDoc, instanceDocs, WorkStatus.NEW)
Expand All @@ -76,6 +96,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
}
}

// Multiple unique works found in the same title cluster; save a report showing how they differ.
if (uniqueWorksAndTheirInstances.size() > 1) {
def (workDocs, instanceDocs) = uniqueWorksAndTheirInstances.transpose()
multiWorkReport.print(Html.hubTable(workDocs, instanceDocs) + Html.HORIZONTAL_RULE)
Expand All @@ -101,20 +122,20 @@ void saveAndLink(Doc workDoc, Collection<Doc> instanceDocs = [], boolean existsI

if (!instanceDocs.isEmpty()) {
selectByIds(instanceDocs.collect { it.shortId() }) {
it.graph[1]['instanceOf'] = ['@id': workDoc.thingIri()]
it.graph[1][WORK_KEY] = [(ID_KEY): workDoc.thingIri()]
it.scheduleSave(changedBy: changedBy, generationProcess: generationProcess)
}
}
}

Doc createNewWork(Map workData) {
workData['@id'] = "TEMPID#it"
workData[ID_KEY] = "TEMPID#it"
Map data = [
"@graph": [
(GRAPH_KEY): [
[
"@id" : "TEMPID",
"@type" : "Record",
"mainEntity": ["@id": "TEMPID#it"],
(ID_KEY) : "TEMPID",
(TYPE_KEY) : "Record",
(THING_KEY): [(ID_KEY): "TEMPID#it"],

],
workData
Expand All @@ -127,12 +148,12 @@ Doc createNewWork(Map workData) {
void addAdminMetadata(Doc doc, List<Map> derivedFrom) {
doc.record()['hasChangeNote'] = [
[
'@type': 'CreateNote',
(ID_KEY): 'CreateNote',
'tool' : ['@id': 'https://id.kb.se/generator/mergeworks']
]
]
doc.record()['derivedFrom'] = derivedFrom
doc.record()['descriptionLanguage'] = ['@id': 'https://id.kb.se/language/swe']
doc.record()['descriptionLanguage'] = [(ID_KEY): 'https://id.kb.se/language/swe']
}

void writeWorkReport(Collection<Doc> titleCluster, Doc derivedWork, Collection<Doc> derivedFrom, WorkStatus workStatus) {
Expand Down Expand Up @@ -175,7 +196,7 @@ boolean addCloseMatch(Doc workDoc, List<Doc> linkableWorks) {
def linkTo = linkableWorks.findAll { d ->
d.workIri() != workDoc.thingIri()
&& d.primaryContributor() == workDoc.primaryContributor()
}.collect { ['@id': it.workIri()] }
}.collect { [(ID_KEY): it.workIri()] }

def closeMatch = asList(workDoc.workData['closeMatch'])

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
/**
* Use various methods for completing and normalizing contributions within a work cluster.
* See individual methods for details.
*/

import groovy.transform.Memoized
import org.apache.commons.lang3.StringUtils

Expand Down Expand Up @@ -132,6 +137,7 @@ selectByIds(clusters.flatten()) { bib ->
modified |= tryLinkAgent(c, id)
// if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those
modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id)
// if two local agents match on name and one of them has lifeSpan and the other doesn't, add that lifeSpan to the one missing it.
modified |= tryAddLifeSpanToLocalAgent(c, id)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
/**
* Move contribution to instance if the role's domain is (or is subclass of) Embodiment.
 * Also move illustrator to instance if none of the following criteria are met:
* - The illustrator is the primary contributor (PrimaryContribution)
* - Classification indicates a picture book or comics
* - Genre/form indicates a picture book or comics
* See isComics() and isPictureBook() below for details.
*/

import whelk.Whelk

import java.util.concurrent.ConcurrentHashMap
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
/**
 * Drop from clusters those works that are translations but lack a translator in their contribution.
*/

import se.kb.libris.mergeworks.Doc

new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
/**
* Remove language appearing as a substring of work main title if already described in the language property.
* E.g. "Pippi Långstrump (Svenska)" --> "Pippi Långstrump" when work.language = [{'@id': 'https://id.kb.se/language/swe'}]
*/

import groovy.transform.Memoized
import whelk.util.DocumentUtil

Expand Down
Loading
Loading