From e9bf15effbd1ea8d985b10aa46934d0f03303d98 Mon Sep 17 00:00:00 2001 From: Ludovic DANIEL Date: Wed, 3 Apr 2024 14:24:25 +0200 Subject: [PATCH 1/3] 9276 - CVOC : allow customized mapping of indexed fields of cvoc configuration + handle ontoportal json formats from externalvocabularyvalue --- doc/release-notes/9276-doc-cvoc-index-in.md | 8 ++ .../iq/dataverse/DatasetFieldServiceBean.java | 93 ++++++++++++------- .../iq/dataverse/search/IndexServiceBean.java | 40 +++++++- 3 files changed, 107 insertions(+), 34 deletions(-) create mode 100644 doc/release-notes/9276-doc-cvoc-index-in.md diff --git a/doc/release-notes/9276-doc-cvoc-index-in.md b/doc/release-notes/9276-doc-cvoc-index-in.md new file mode 100644 index 00000000000..5c4dd4ca10f --- /dev/null +++ b/doc/release-notes/9276-doc-cvoc-index-in.md @@ -0,0 +1,8 @@ +## Release Highlights + +### Updates on Support for External Vocabulary Services + +#### Indexed field accuracy + +For more relevant indexing, you can now map external vocabulary values to a `managed-fields` of a [:CVocConf setting](https://guides.dataverse.org/en/6.3/installation/config.html#cvocconf) by adding the key `indexIn` in `retrieval-filtering`. +For more information, please check [GDCC/dataverse-external-vocab-support documentation](https://github.com/gdcc/dataverse-external-vocab-support/tree/main/docs). 
\ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java index 6223cd83773..b1717431e41 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java @@ -41,6 +41,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.httpclient.HttpException; +import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpResponse; import org.apache.http.HttpResponseInterceptor; import org.apache.http.client.methods.HttpGet; @@ -321,14 +322,15 @@ public Map getCVocConf(boolean byTermUriField){ + jo.getString("term-uri-field")); } } - if (jo.containsKey("child-fields")) { - JsonArray childFields = jo.getJsonArray("child-fields"); - for (JsonString elm : childFields.getValuesAs(JsonString.class)) { - dft = findByNameOpt(elm.getString()); - logger.info("Found: " + dft.getName()); + if (jo.containsKey("managed-fields")) { + JsonObject managedFields = jo.getJsonObject("managed-fields"); + for (String s : managedFields.keySet()) { + dft = findByNameOpt(managedFields.getString(s)); if (dft == null) { logger.warning("Ignoring External Vocabulary setting for non-existent child field: " - + elm.getString()); + + managedFields.getString(s)); + } else { + logger.info("Found: " + dft.getName()); } } } @@ -345,7 +347,7 @@ public Map getCVocConf(boolean byTermUriField){ * @param df - the primitive/parent compound field containing a newly saved value */ public void registerExternalVocabValues(DatasetField df) { - DatasetFieldType dft =df.getDatasetFieldType(); + DatasetFieldType dft = df.getDatasetFieldType(); logger.fine("Registering for field: " + dft.getName()); JsonObject cvocEntry = getCVocConf(true).get(dft.getId()); if (dft.isPrimitive()) { @@ -366,38 +368,48 @@ public void registerExternalVocabValues(DatasetField df) { } } } - + /** - * 
Retrieves indexable strings from a cached externalvocabularyvalue entry. - * - * This method assumes externalvocabularyvalue entries have been filtered and - * the externalvocabularyvalue entry contain a single JsonObject whose "personName" or "termName" values - * are either Strings or an array of objects with "lang" and ("value" or "content") keys. The - * string, or the "value/content"s for each language are added to the set. - * + * Retrieves indexable strings from a cached externalvocabularyvalue entry filtered through retrieval-filtering configuration. + *

+ * This method externalvocabularyvalue entries have been filtered and contains a single JsonObject. + * Is handled : Strings, Array of Objects with "lang" and ("value" or "content") keys, Object with Strings as value or Object with Array of Strings as value. + * The string, or the "value/content"s for each language are added to the set. + * This method can retrieve string values to be indexed in term-uri-field (parameter defined in CVOC configuration) or in "indexIn" field (optional parameter of retrieval-filtering defined in CVOC configuration). + *

* Any parsing error results in no entries (there can be unfiltered entries with * unknown structure - getting some strings from such an entry could give fairly * random info that would be bad to addd for searches, etc.) - * - * @param termUri + * + * @param termUri unique identifier to search in database + * @param cvocEntry related cvoc configuration + * @param indexingField name of solr field that will be filled with getStringsFor while indexing * @return - a set of indexable strings */ - public Set getStringsFor(String termUri) { - Set strings = new HashSet(); + public Set getIndexableStringsByTermUri(String termUri, JsonObject cvocEntry, String indexingField) { + Set strings = new HashSet<>(); JsonObject jo = getExternalVocabularyValue(termUri); + JsonObject filtering = cvocEntry.getJsonObject("retrieval-filtering"); + String termUriField = cvocEntry.getJsonString("term-uri-field").getString(); if (jo != null) { try { for (String key : jo.keySet()) { - if (key.equals("termName") || key.equals("personName")) { + String indexIn = filtering.getJsonObject(key).getString("indexIn", null); + // Either we are in mapping mode so indexingField (solr field) equals indexIn (cvoc config) + // Or we are in default mode indexingField is termUriField, indexIn is not defined then only termName and personName keys are used + if (indexingField.equals(indexIn) || + (indexIn == null && termUriField.equals(indexingField) && (key.equals("termName")) || key.equals("personName"))) { JsonValue jv = jo.get(key); if (jv.getValueType().equals(JsonValue.ValueType.STRING)) { logger.fine("adding " + jo.getString(key) + " for " + termUri); strings.add(jo.getString(key)); - } else { - if (jv.getValueType().equals(JsonValue.ValueType.ARRAY)) { - JsonArray jarr = jv.asJsonArray(); - for (int i = 0; i < jarr.size(); i++) { + } else if (jv.getValueType().equals(JsonValue.ValueType.ARRAY)) { + JsonArray jarr = jv.asJsonArray(); + for (int i = 0; i < jarr.size(); i++) { + if 
(jarr.get(i).getValueType().equals(JsonValue.ValueType.STRING)) { + strings.add(jarr.getString(i)); + } else if (jarr.get(i).getValueType().equals(ValueType.OBJECT)) { // This condition handles SKOMOS format like [{"lang": "en","value": "non-apis bee"},{"lang": "fr","value": "abeille non apis"}] JsonObject entry = jarr.getJsonObject(i); if (entry.containsKey("value")) { logger.fine("adding " + entry.getString("value") + " for " + termUri); @@ -409,6 +421,22 @@ public Set getStringsFor(String termUri) { } } } + } else if (jv.getValueType().equals(JsonValue.ValueType.OBJECT)) { + JsonObject joo = jv.asJsonObject(); + for (Map.Entry entry : joo.entrySet()) { + if (entry.getValue().getValueType().equals(JsonValue.ValueType.STRING)) { // This condition handles format like { "fr": "association de quartier", "en": "neighborhood associations"} + logger.fine("adding " + joo.getString(entry.getKey()) + " for " + termUri); + strings.add(joo.getString(entry.getKey())); + } else if (entry.getValue().getValueType().equals(ValueType.ARRAY)) { // This condition handles format like {"en": ["neighbourhood societies"]} + JsonArray jarr = entry.getValue().asJsonArray(); + for (int i = 0; i < jarr.size(); i++) { + if (jarr.get(i).getValueType().equals(JsonValue.ValueType.STRING)) { + logger.fine("adding " + jarr.getString(i) + " for " + termUri); + strings.add(jarr.getString(i)); + } + } + } + } } } } @@ -420,7 +448,7 @@ public Set getStringsFor(String termUri) { } logger.fine("Returning " + String.join(",", strings) + " for " + termUri); return strings; - } + } /** * Perform a query to retrieve a cached value from the externalvocabularvalue table @@ -454,10 +482,11 @@ public JsonObject getExternalVocabularyValue(String termUri) { public void registerExternalTerm(JsonObject cvocEntry, String term) { String retrievalUri = cvocEntry.getString("retrieval-uri"); String prefix = cvocEntry.getString("prefix", null); - if(term.isBlank()) { - logger.fine("Ingoring blank term"); + 
if(StringUtils.isBlank(term)) { + logger.fine("Ignoring blank term"); return; } + boolean isExternal = false; JsonObject vocabs = cvocEntry.getJsonObject("vocabs"); for (String key: vocabs.keySet()) { @@ -512,7 +541,7 @@ public void process(HttpResponse response, HttpContext context) throws HttpExcep if (statusCode == 200) { logger.fine("Returned data: " + data); try (JsonReader jsonReader = Json.createReader(new StringReader(data))) { - String dataObj =filterResponse(cvocEntry, jsonReader.readObject(), term).toString(); + String dataObj = filterResponse(cvocEntry, jsonReader.readObject(), term).toString(); evv.setValue(dataObj); evv.setLastUpdateDate(Timestamp.from(Instant.now())); logger.fine("JsonObject: " + dataObj); @@ -543,7 +572,7 @@ public void process(HttpResponse response, HttpContext context) throws HttpExcep * Parse the raw value returned by an external service for a give term uri and * filter it according to the 'retrieval-filtering' configuration for this * DatasetFieldType, creating a Json value with the specified structure - * + * * @param cvocEntry - the config for this DatasetFieldType * @param readObject - the raw response from the service * @param termUri - the term uri @@ -602,6 +631,8 @@ private JsonObject filterResponse(JsonObject cvocEntry, JsonObject readObject, S if (pattern.equals("{0}")) { if (vals.get(0) instanceof JsonArray) { job.add(filterKey, (JsonArray) vals.get(0)); + } else if (vals.get(0) instanceof JsonObject) { + job.add(filterKey, (JsonObject) vals.get(0)); } else { job.add(filterKey, (String) vals.get(0)); } @@ -639,7 +670,7 @@ Object processPathSegment(int index, String[] pathParts, JsonValue curPath, Stri String[] keyVal = pathParts[index].split("="); logger.fine("Looking for object where " + keyVal[0] + " is " + keyVal[1]); String expected = keyVal[1]; - + if (!expected.equals("*")) { if (expected.equals("@id")) { expected = termUri; @@ -668,7 +699,7 @@ Object processPathSegment(int index, String[] pathParts, JsonValue 
curPath, Stri } return parts.build(); } - + } else { curPath = ((JsonObject) curPath).get(pathParts[index]); logger.fine("Found next Path object " + curPath.toString()); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index d6b3fd8c339..cf8a37e0a80 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -922,6 +922,20 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set langs = settingsService.getConfiguredLanguages(); Map cvocMap = datasetFieldService.getCVocConf(true); + Map> cvocManagedFieldMap = new HashMap<>(); + for (Map.Entry cvocEntry : cvocMap.entrySet()) { + if(cvocEntry.getValue().containsKey("managed-fields")) { + JsonObject managedFields = cvocEntry.getValue().getJsonObject("managed-fields"); + Set managedFieldValues = new HashSet<>(); + for (String s : managedFields.keySet()) { + managedFieldValues.add(managedFields.getString(s)); + } + cvocManagedFieldMap.put(cvocEntry.getKey(), managedFieldValues); + } + } + + + Set metadataBlocksWithValue = new HashSet<>(); for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) { @@ -996,19 +1010,39 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set vals = dsf.getValues_nondisplay(); - Set searchStrings = new HashSet(); + Set searchStrings = new HashSet<>(); for (String val: vals) { searchStrings.add(val); - searchStrings.addAll(datasetFieldService.getStringsFor(val)); + // Try to get string values from externalvocabularyvalue using val as termUri + searchStrings.addAll(datasetFieldService.getIndexableStringsByTermUri(val, cvocMap.get(dsfType.getId()), dsfType.getName())); + + if(dsfType.getParentDatasetFieldType()!=null) { + List childDatasetFields = dsf.getParentDatasetFieldCompoundValue().getChildDatasetFields(); + for (DatasetField df : 
childDatasetFields) { + if(cvocManagedFieldMap.get(dsfType.getId()).contains(df.getDatasetFieldType().getName())) { + String solrManagedFieldSearchable = df.getDatasetFieldType().getSolrField().getNameSearchable(); + // Try to get string values from externalvocabularyvalue but for a managed fields of the CVOCConf + Set stringsForManagedField = datasetFieldService.getIndexableStringsByTermUri(val, cvocMap.get(dsfType.getId()), df.getDatasetFieldType().getName()); + logger.fine(solrManagedFieldSearchable + " filled with externalvocabularyvalue : " + stringsForManagedField); + //.addField works as addition of value not a replace of value + // it allows to add mapped values by CVOCConf before or after indexing real DatasetField value(s) of solrManagedFieldSearchable + solrInputDocument.addField(solrManagedFieldSearchable, stringsForManagedField); + } + } + } } + logger.fine(solrFieldSearchable + " filled with externalvocabularyvalue : " + searchStrings); solrInputDocument.addField(solrFieldSearchable, searchStrings); if (dsfType.getSolrField().isFacetable()) { + logger.fine(solrFieldFacetable + " gets " + vals); solrInputDocument.addField(solrFieldFacetable, vals); } } + if (dsfType.isControlledVocabulary()) { /** If the cvv list is empty but the dfv list is not then it is assumed this was harvested * from an installation that had controlled vocabulary entries that don't exist in our this db From 6b2e111142a39083db578d429394375dc061ce60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20ROUCOU?= Date: Tue, 23 Apr 2024 12:08:42 +0200 Subject: [PATCH 2/3] Add unit tests for "getIndexableStringsByTermUri" method --- .../DatasetFieldServiceBeanTest.java | 179 ++++++++++++++++++ src/test/resources/json/cvoc-agroportal.json | 76 ++++++++ src/test/resources/json/cvoc-orcid.json | 43 +++++ src/test/resources/json/cvoc-skosmos.json | 69 +++++++ 4 files changed, 367 insertions(+) create mode 100644 src/test/java/edu/harvard/iq/dataverse/DatasetFieldServiceBeanTest.java 
create mode 100644 src/test/resources/json/cvoc-agroportal.json create mode 100644 src/test/resources/json/cvoc-orcid.json create mode 100644 src/test/resources/json/cvoc-skosmos.json diff --git a/src/test/java/edu/harvard/iq/dataverse/DatasetFieldServiceBeanTest.java b/src/test/java/edu/harvard/iq/dataverse/DatasetFieldServiceBeanTest.java new file mode 100644 index 00000000000..873d417131d --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/DatasetFieldServiceBeanTest.java @@ -0,0 +1,179 @@ +package edu.harvard.iq.dataverse; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.Set; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.AdditionalMatchers; +import org.mockito.Mockito; + +import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import jakarta.json.Json; +import jakarta.json.JsonObject; + +public class DatasetFieldServiceBeanTest { + + private DatasetFieldServiceBean datasetFieldServiceBean; + + static String getCvocJson(String pathToJsonFile) throws IOException { + final File datasetVersionJson = new File(pathToJsonFile); + return new String(Files.readAllBytes(Paths.get(datasetVersionJson.getAbsolutePath()))); + } + + @BeforeEach + void setUp() { + this.datasetFieldServiceBean = Mockito.spy(new DatasetFieldServiceBean()); + } + + @AfterEach + void tearDown() { + this.datasetFieldServiceBean = null; + } + + @Test + void getIndexableStringsByTermUriSkosmos() throws IOException { + String fieldName = "keyword"; + String termURI = "http://aims.fao.org/aos/agrovoc/c_2389"; + + JsonObject cvocEntry = prepare(fieldName, "src/test/resources/json/cvoc-skosmos.json"); + + JsonObject getExtVocabValueReturnedValue = Json.createObjectBuilder() + .add("@id", termURI) + .add("termName", 
Json.createArrayBuilder() + .add(Json.createObjectBuilder() + .add("lang", "fr") + .add("value", "faux bourdon")) + .add(Json.createObjectBuilder() + .add("lang", "en") + .add("value", "drone (insects)"))) + .add("vocabularyUri", "http://aims.fao.org/aos/agrovoc") + .add("synonyms", Json.createArrayBuilder() + .add(Json.createObjectBuilder() + .add("lang", "fr") + .add("value", "Abeille mâle")) + .add(Json.createObjectBuilder() + .add("lang", "en") + .add("value", "drone honey bees"))) + .add("genericTerm", Json.createArrayBuilder() + .add(Json.createObjectBuilder() + .add("lang", "fr") + .add("value", "Colonie d'abeilles")) + .add(Json.createObjectBuilder() + .add("lang", "en") + .add("value", "bee colonies"))) + .build(); + Mockito.doReturn(getExtVocabValueReturnedValue).when(datasetFieldServiceBean).getExternalVocabularyValue(termURI); + Mockito.doReturn(null).when(datasetFieldServiceBean).getExternalVocabularyValue(AdditionalMatchers.not(Mockito.eq(termURI))); + + // keywordTermURL + Set result = datasetFieldServiceBean.getIndexableStringsByTermUri(termURI, cvocEntry, "keywordTermURL"); + assertEquals(Set.of("faux bourdon", "drone (insects)"), result); + + // keywordValue + result = datasetFieldServiceBean.getIndexableStringsByTermUri(termURI, cvocEntry, "keywordValue"); + assertEquals(Collections.emptySet(), result, "Only 'keywordTermURL' must return values for Skosmos"); + + // Any others field + result = datasetFieldServiceBean.getIndexableStringsByTermUri(termURI, cvocEntry, ""); + assertEquals(Collections.emptySet(), result, "Only 'keywordTermURL' must return values for Skosmos"); + + // Another termURI not in database + result = datasetFieldServiceBean.getIndexableStringsByTermUri("http://example.org/uuid", cvocEntry, "keywordTermURL"); + assertEquals(Collections.emptySet(), result); + } + + @Test + void getIndexableStringsByTermUriAgroportal() throws IOException { + String fieldName = "keyword"; + String termURI = 
"http://aims.fao.org/aos/agrovoc/c_50265"; + + JsonObject cvocEntry = prepare(fieldName, "src/test/resources/json/cvoc-agroportal.json"); + + JsonObject getExtVocabValueReturnedValue = Json.createObjectBuilder() + .add("@id", termURI) + .add("termName", Json.createObjectBuilder() + .add("fr", "association de quartier") + .add("en", "neighborhood associations")) + .add("vocabularyName", "https://data.agroportal.lirmm.fr/ontologies/AGROVOC") + .add("vocabularyUri", "https://data.agroportal.lirmm.fr/ontologies/AGROVOC") + .add("synonyms", Json.createObjectBuilder() + .add("en", Json.createArrayBuilder().add("neighborhood societies"))) + .build(); + Mockito.doReturn(getExtVocabValueReturnedValue).when(datasetFieldServiceBean).getExternalVocabularyValue(termURI); + Mockito.doReturn(null).when(datasetFieldServiceBean).getExternalVocabularyValue(AdditionalMatchers.not(Mockito.eq(termURI))); + + // keywordValue + Set result = datasetFieldServiceBean.getIndexableStringsByTermUri(termURI, cvocEntry, "keywordValue"); + assertEquals(Set.of("association de quartier", "neighborhood associations", "neighborhood societies"), result); + + // keywordTermURL + result = datasetFieldServiceBean.getIndexableStringsByTermUri(termURI, cvocEntry, "keywordTermURL"); + assertEquals(Collections.emptySet(), result, "Only 'keywordValue' must return values for Agroportal"); + + // Any others field + result = datasetFieldServiceBean.getIndexableStringsByTermUri(termURI, cvocEntry, ""); + assertEquals(Collections.emptySet(), result, "Only 'keywordValue' must return values for Agroportal"); + + // Another termURI not in database + result = datasetFieldServiceBean.getIndexableStringsByTermUri("http://example.org/uuid", cvocEntry, "keywordValue"); + assertEquals(Collections.emptySet(), result); + } + + @Test + void getIndexableStringsByTermUriOrcid() throws IOException { + String fieldName = "creator"; + String termURI = "https://orcid.org/0000-0003-4217-153X"; + + JsonObject cvocEntry = 
prepare(fieldName, "src/test/resources/json/cvoc-orcid.json"); + + JsonObject getExtVocabValueReturnedValue = Json.createObjectBuilder() + .add("@id", termURI) + .add("scheme", "ORCID") + .add("@type", "https://schema.org/Person") + .add("personName", "Doe, John") + .build(); + Mockito.doReturn(getExtVocabValueReturnedValue).when(datasetFieldServiceBean).getExternalVocabularyValue(termURI); + Mockito.doReturn(null).when(datasetFieldServiceBean).getExternalVocabularyValue(AdditionalMatchers.not(Mockito.eq(termURI))); + + // ORCID match with "personName" field into "getIndexableStringsByTermUri" method + Set result = datasetFieldServiceBean.getIndexableStringsByTermUri(termURI, cvocEntry, ""); + assertEquals(Set.of("Doe, John"), result); + + // Another termURI not in database + result = datasetFieldServiceBean.getIndexableStringsByTermUri("http://example.org/uuid", cvocEntry, fieldName); + assertEquals(Collections.emptySet(), result); + } + + /** + * Prepare unit tests with mock methods. + * + * @param fieldName "field-name" into cvoc configuration file + * @param jsonPath path of the JSON configuration file: src/test/resources/json/... 
+ * @return {@link JsonObject} representing the configuration file + * @throws IOException in case on read error on configuration file + */ + JsonObject prepare(String fieldName, String jsonPath) throws IOException { + Long dftId = Long.parseLong("1"); + // DatasetFieldType name corresponding to "field-name" into cvoc configuration file + DatasetFieldType dft = new DatasetFieldType(fieldName, DatasetFieldType.FieldType.NONE, true); + dft.setId(dftId); + + Mockito.doReturn(dft).when(datasetFieldServiceBean).findByNameOpt(fieldName); + Mockito.doReturn(null).when(datasetFieldServiceBean).findByNameOpt(AdditionalMatchers.not(Mockito.eq(fieldName))); + + SettingsServiceBean settingsService = Mockito.mock(SettingsServiceBean.class); + Mockito.when(settingsService.getValueForKey(SettingsServiceBean.Key.CVocConf)).thenReturn(getCvocJson(jsonPath)); + datasetFieldServiceBean.settingsService = settingsService; + + return datasetFieldServiceBean.getCVocConf(false).get(dftId); + } + +} diff --git a/src/test/resources/json/cvoc-agroportal.json b/src/test/resources/json/cvoc-agroportal.json new file mode 100644 index 00000000000..03c9e2f4d07 --- /dev/null +++ b/src/test/resources/json/cvoc-agroportal.json @@ -0,0 +1,76 @@ +[ + { + "field-name": "keyword", + "term-uri-field": "keywordTermURL", + "cvoc-url": "https://data.agroportal.lirmm.fr/", + "js-url": "https://domain.tld/assets/cvoc/ontoportal.js", + "headers": { + "Authorization": "apikey token=XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" + }, + "protocol": "ontoportal", + "retrieval-uri": "https://data.agroportal.lirmm.fr/ontologies/{keywordVocabulary}/classes/{encodeUrl:keywordTermURL}?language=en,fr", + "term-parent-uri": "", + "allow-free-text": true, + "languages": "en, fr", + "vocabs": { + "AGROVOC": { + "vocabularyUri": "https://data.agroportal.lirmm.fr/ontologies/AGROVOC", + "uriSpace": "http" + }, + "ONTOBIOTOPE": { + "vocabularyUri": "https://data.agroportal.lirmm.fr/ontologies/ONTOBIOTOPE", + "uriSpace": "http" + }, + 
"CROPUSAGE": { + "vocabularyUri": "https://data.agroportal.lirmm.fr/ontologies/CROPUSAGE", + "uriSpace": "http" + } + }, + "managed-fields": { + "vocabularyName": "keywordVocabulary", + "termName": "keywordValue", + "vocabularyUri": "keywordVocabularyURI" + }, + "retrieval-filtering": { + "@context": { + "termName": "https://schema.org/name", + "vocabularyName": "https://dataverse.org/schema/vocabularyName", + "vocabularyUri": "https://dataverse.org/schema/vocabularyUri", + "lang": "@language", + "value": "@value" + }, + "@id": { + "pattern": "{0}", + "params": [ + "@id" + ] + }, + "termName": { + "pattern": "{0}", + "params": [ + "/prefLabel" + ], + "indexIn": "keywordValue" + }, + "vocabularyName": { + "pattern": "{0}", + "params": [ + "/links/ontology" + ] + }, + "vocabularyUri": { + "pattern": "{0}", + "params": [ + "/links/ontology" + ] + }, + "synonyms": { + "pattern": "{0}", + "params": [ + "/synonym" + ], + "indexIn": "keywordValue" + } + } + } +] diff --git a/src/test/resources/json/cvoc-orcid.json b/src/test/resources/json/cvoc-orcid.json new file mode 100644 index 00000000000..6b904aefc3f --- /dev/null +++ b/src/test/resources/json/cvoc-orcid.json @@ -0,0 +1,43 @@ +[ + { + "field-name": "creator", + "term-uri-field": "creator", + "js-url": "https://gdcc.github.io/dataverse-external-vocab-support/scripts/people.js", + "protocol": "orcid", + "retrieval-uri": "https://pub.orcid.org/v3.0/{0}/person", + "allow-free-text": true, + "prefix": "https://orcid.org/", + "managed-fields": {}, + "languages": "", + "vocabs": { + "orcid": { + "uriSpace": "https://orcid.org/" + } + }, + "retrieval-filtering": { + "@context": { + "personName": "https://schema.org/name", + "scheme": "http://www.w3.org/2004/02/skos/core#inScheme" + }, + "personName": { + "pattern": "{0}, {1}", + "params": [ + "/name/family-name/value", + "/name/given-names/value" + ] + }, + "@id": { + "pattern": "{0}", + "params": [ + "@id" + ] + }, + "scheme": { + "pattern": "ORCID" + }, + "@type": { + 
"pattern": "https://schema.org/Person" + } + } + } +] diff --git a/src/test/resources/json/cvoc-skosmos.json b/src/test/resources/json/cvoc-skosmos.json new file mode 100644 index 00000000000..6d32b29f054 --- /dev/null +++ b/src/test/resources/json/cvoc-skosmos.json @@ -0,0 +1,69 @@ +[ + { + "field-name": "keyword", + "term-uri-field": "keywordTermURL", + "cvoc-url": "https://demo.skosmos.org/", + "js-url": "https://github.com/gdcc/dataverse-external-vocab-support/blob/main/scripts/skosmos.js", + "protocol": "skosmos", + "retrieval-uri": "https://demo.skosmos.org/rest/v1/data?uri={0}", + "term-parent-uri": "", + "allow-free-text": true, + "languages": "en, fr", + "vocabs": { + "agrovoc": { + "vocabularyUri": "http://aims.fao.org/vest-registry/kos/agrovoc", + "uriSpace": "http://aims.fao.org/aos/agrovoc/" + } + }, + "managed-fields": { + "vocabularyName": "keywordVocabulary", + "termName": "keywordValue", + "vocabularyUri": "keywordVocabularyURI" + }, + "retrieval-filtering": { + "@context": { + "termName": "https://schema.org/name", + "vocabularyName": "https://dataverse.org/schema/vocabularyName", + "vocabularyUri": "https://dataverse.org/schema/vocabularyUri", + "lang": "@language", + "value": "@value" + }, + "@id": { + "pattern": "{0}", + "params": [ + "@id" + ] + }, + "termName": { + "pattern": "{0}", + "params": [ + "/graph/uri=@id/prefLabel" + ] + }, + "vocabularyName": { + "pattern": "{0}", + "params": [ + "/graph/type=skos:ConceptScheme/prefLabel" + ] + }, + "vocabularyUri": { + "pattern": "{0}", + "params": [ + "/graph/type=skos:ConceptScheme/uri" + ] + }, + "synonyms": { + "pattern": "{0}", + "params": [ + "/graph/uri=@id/altLabel" + ] + }, + "genericTerm": { + "pattern": "{0}", + "params": [ + "/graph/type=skos:Concept/prefLabel" + ] + } + } + } +] From adf50744e42e44ad5f4f259e43c490859b7e8e0e Mon Sep 17 00:00:00 2001 From: Ludovic DANIEL Date: Thu, 2 May 2024 11:36:28 +0200 Subject: [PATCH 3/3] Update documentations related to PR 'CVOC : Indexed field 
accuracy (Ontoportal integration) #10505' --- doc/release-notes/9276-doc-cvoc-index-in.md | 16 +++++++++++++--- .../source/admin/metadatacustomization.rst | 6 ++++-- .../iq/dataverse/DatasetFieldServiceBean.java | 10 +++++----- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/doc/release-notes/9276-doc-cvoc-index-in.md b/doc/release-notes/9276-doc-cvoc-index-in.md index 5c4dd4ca10f..78289201511 100644 --- a/doc/release-notes/9276-doc-cvoc-index-in.md +++ b/doc/release-notes/9276-doc-cvoc-index-in.md @@ -2,7 +2,17 @@ ### Updates on Support for External Vocabulary Services -#### Indexed field accuracy +Multiple extensions of the External Vocabulary mechanism have been added. These extensions allow interaction with services based on the Ontoportal software and are expected to be generally useful for other service types. -For more relevant indexing, you can now map external vocabulary values to a `managed-fields` of a [:CVocConf setting](https://guides.dataverse.org/en/6.3/installation/config.html#cvocconf) by adding the key `indexIn` in `retrieval-filtering`. -For more information, please check [GDCC/dataverse-external-vocab-support documentation](https://github.com/gdcc/dataverse-external-vocab-support/tree/main/docs). \ No newline at end of file +These changes include: + +#### Improved Indexing with Compound Fields + +When using an external vocabulary service with compound fields, you can now specify which field(s) will include additional indexed information, such as translations of an entry into other languages. This is done by adding the `indexIn` in `retrieval-filtering`. (#10505) +For more information, please check [GDCC/dataverse-external-vocab-support documentation](https://github.com/gdcc/dataverse-external-vocab-support/tree/main/docs). 
+ +#### Broader Support for Indexing Service Responses + +Indexing of the results from `retrieval-filtering` responses can now handle additional formats including Json Arrays of Strings and values from arbitrary keys within a JSON Object. (#10505) + +**** This documentation must be merged with 9276-allow-flexible-params-in-retrievaluri-cvoc.md (#10404) \ No newline at end of file diff --git a/doc/sphinx-guides/source/admin/metadatacustomization.rst b/doc/sphinx-guides/source/admin/metadatacustomization.rst index 66911aa0ad1..e70cf0e0897 100644 --- a/doc/sphinx-guides/source/admin/metadatacustomization.rst +++ b/doc/sphinx-guides/source/admin/metadatacustomization.rst @@ -552,6 +552,8 @@ Great care must be taken when reloading a metadata block. Matching is done on fi The ability to reload metadata blocks means that SQL update scripts don't need to be written for these changes. See also the :doc:`/developers/sql-upgrade-scripts` section of the Developer Guide. +.. _using-external-vocabulary-services: + Using External Vocabulary Services ---------------------------------- @@ -577,9 +579,9 @@ In general, the external vocabulary support mechanism may be a better choice for The specifics of the user interface for entering/selecting a vocabulary term and how that term is then displayed are managed by third-party Javascripts. The initial Javascripts that have been created provide auto-completion, displaying a list of choices that match what the user has typed so far, but other interfaces, such as displaying a tree of options for a hierarchical vocabulary, are possible. Similarly, existing scripts do relatively simple things for displaying a term - showing the term's name in the appropriate language and providing a link to an external URL with more information, but more sophisticated displays are possible. 
-Scripts supporting use of vocabularies from services supporting the SKOMOS protocol (see https://skosmos.org) and retrieving ORCIDs (from https://orcid.org) are available https://github.com/gdcc/dataverse-external-vocab-support. (Custom scripts can also be used and community members are encouraged to share new scripts through the dataverse-external-vocab-support repository.) +Scripts supporting use of vocabularies from services supporting the Skosmos protocol (see https://skosmos.org), retrieving ORCIDs (from https://orcid.org), using services based on the OntoPortal software (see https://ontoportal.org/), and using ROR (https://ror.org/) are available at https://github.com/gdcc/dataverse-external-vocab-support. (Custom scripts can also be used and community members are encouraged to share new scripts through the dataverse-external-vocab-support repository.) -Configuration involves specifying which fields are to be mapped, whether free-text entries are allowed, which vocabulary(ies) should be used, what languages those vocabulary(ies) are available in, and several service protocol and service instance specific parameters. +Configuration involves specifying which fields are to be mapped, to which Solr field they should be indexed, whether free-text entries are allowed, which vocabulary(ies) should be used, what languages those vocabulary(ies) are available in, and several service protocol and service instance specific parameters, including the ability to send HTTP headers on calls to the service. These are all defined in the :ref:`:CVocConf <:CVocConf>` setting as a JSON array. Details about the required elements as well as example JSON arrays are available at https://github.com/gdcc/dataverse-external-vocab-support, along with an example metadata block that can be used for testing. The scripts required can be hosted locally or retrieved dynamically from https://gdcc.github.io/ (similar to how dataverse-previewers work). 
diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java index b1717431e41..43648fa3b6d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java @@ -330,7 +330,7 @@ public Map getCVocConf(boolean byTermUriField){ logger.warning("Ignoring External Vocabulary setting for non-existent child field: " + managedFields.getString(s)); } else { - logger.info("Found: " + dft.getName()); + logger.fine("Found: " + dft.getName()); } } } @@ -372,10 +372,10 @@ public void registerExternalVocabValues(DatasetField df) { /** * Retrieves indexable strings from a cached externalvocabularyvalue entry filtered through retrieval-filtering configuration. *

- * This method externalvocabularyvalue entries have been filtered and contains a single JsonObject. - * Is handled : Strings, Array of Objects with "lang" and ("value" or "content") keys, Object with Strings as value or Object with Array of Strings as value. - * The string, or the "value/content"s for each language are added to the set. - * This method can retrieve string values to be indexed in term-uri-field (parameter defined in CVOC configuration) or in "indexIn" field (optional parameter of retrieval-filtering defined in CVOC configuration). + * This method assumes externalvocabularyvalue entries have been filtered and that they contain a single JsonObject. + * Cases handled: a String, an Array of Strings, an Array of Objects with "value" or "content" keys, or an Object whose entries have either String values or Arrays of Strings as values. + * The string(s), or the "value/content"s for each language are added to the set. + * Retrieved string values are indexed in the term-uri-field (parameter defined in CVOC configuration) by default, or in the field specified by an optional "indexIn" parameter in the retrieval-filtering defined in the CVOC configuration. *

* Any parsing error results in no entries (there can be unfiltered entries with * unknown structure - getting some strings from such an entry could give fairly