Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #105: "multiple conformsTo values" #111

Merged
merged 24 commits into from
Jun 23, 2023
Merged
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ffaacf4
refactor(reader): reduce complexity
Pfeil May 5, 2023
8cf1d62
fix: avoid memory duplication
Pfeil May 8, 2023
b43c233
docs: move inline comment info to method docs
Pfeil May 8, 2023
8c4ce84
feat: run reader in parallel for large metadata
Pfeil May 8, 2023
7dbbde7
fix(docs): remove docs from old return value
Pfeil May 8, 2023
15878ab
test: read crate with multiple conformsTo values
Pfeil May 8, 2023
bfbdb13
refactor: move generic method into helper class
Pfeil May 23, 2023
8f70ab7
build: add build info to gradle output
Pfeil Jun 20, 2023
a4bd843
Merge remote-tracking branch 'origin/main' into 105-multiple-conforms…
Pfeil Jun 20, 2023
9377c7c
fix: possible exceptions when resolving fake IDs on certain runtime e…
Pfeil Jun 20, 2023
caee947
sonarlint: compare strings using equals
Pfeil Jun 20, 2023
82322f4
feat: enable reading of crates with multiple conformsTo values
Pfeil Jun 20, 2023
e81174b
test: export does not mess up v1.2-DRAFT descriptors
Pfeil Jun 20, 2023
c07f25b
feat: create crates according v1.2-DRAFT, including the multiple conf…
Pfeil Jun 21, 2023
c925b40
cleanup: imports, small todo
Pfeil Jun 21, 2023
21a43ae
fix: javadoc recognized comparison operators as html
Pfeil Jun 21, 2023
1f039dc
docs: add parameter description
Pfeil Jun 21, 2023
295a46a
feat: support draft features on existing crates
Pfeil Jun 23, 2023
6741fc7
fix(docs): inheritDoc in constructors does not work
Pfeil Jun 23, 2023
0d4fd82
refactor: simplify crate builders
Pfeil Jun 23, 2023
123f789
fix(docs): Add missing summary
Pfeil Jun 23, 2023
f6c7602
docs: state compatibility in README.md
Pfeil Jun 23, 2023
ad1e914
feat: convenience methods for comparing versions
Pfeil Jun 23, 2023
025c19e
fix: misleading variable names and useless checks
Pfeil Jun 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 113 additions & 57 deletions src/main/java/edu/kit/datamanager/ro_crate/reader/RoCrateReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
* This class allows reading crates from the outside into the library in order
Expand All @@ -31,6 +34,19 @@
*/
public class RoCrateReader {

private static final String FILE_PREVIEW_FILES = "ro-crate-preview_files";
private static final String FILE_PREVIEW_HTML = "ro-crate-preview.html";
private static final String FILE_METADATA_JSON = "ro-crate-metadata.json";

protected static final String SPECIFICATION_PREFIX = "https://w3id.org/ro/crate/";

protected static final String PROP_ABOUT = "about";
protected static final String PROP_CONTEXT = "@context";
protected static final String PROP_CONFORMS_TO = "conformsTo";
protected static final String PROP_GRAPH = "@graph";
protected static final String PROP_HAS_PART = "hasPart";
protected static final String PROP_ID = "@id";

private final ReaderStrategy reader;

public RoCrateReader(ReaderStrategy reader) {
Expand All @@ -45,32 +61,32 @@ public RoCrateReader(ReaderStrategy reader) {
* @return the read RO-crate
*/
public RoCrate readCrate(String location) {
RoCrate crate = new RoCrate();
// get the ro-crate-metadata.json
ObjectNode metadataJson = reader.readMetadataJson(location);
// get the content of the crate
File files = reader.readContent(location);

// this set will contain the files that are associated with entities
HashSet<String> usedFiles = new HashSet<>();
usedFiles.add(new File(location).toPath().resolve("ro-crate-metadata.json").toFile().getPath());
usedFiles.add(new File(location).toPath().resolve("ro-crate-preview.html").toFile().getPath());
usedFiles.add(new File(location).toPath().resolve("ro-crate-preview_files").toFile().getPath());

JsonNode context = metadataJson.get("@context");

usedFiles.add(new File(location).toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
usedFiles.add(new File(location).toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
usedFiles.add(new File(location).toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
Pfeil marked this conversation as resolved.
Show resolved Hide resolved
JsonNode context = metadataJson.get(PROP_CONTEXT);
CrateMetadataContext crateContext = new RoCrateMetadataContext(context);
RoCrate crate = new RoCrate();
crate.setMetadataContext(crateContext);
JsonNode graph = metadataJson.get("@graph");
JsonNode graph = metadataJson.get(PROP_GRAPH);

if (graph.isArray()) {

graph = setRootEntities(crate, (ArrayNode) graph);
for (JsonNode node : graph) {
// if the id is in the root hasPart list, we know this entity is a data entity
RootDataEntity root = crate.getRootDataEntity();
if (root != null && root.hasInHasPart(node.get("@id").asText())) {
File loc = checkFolderHasFile(node.get("@id").asText(), files);
if (root != null && root.hasInHasPart(node.get(PROP_ID).asText())) {
File loc = checkFolderHasFile(node.get(PROP_ID).asText(), files);
if (loc != null) {
usedFiles.add(loc.getPath());
}
Expand Down Expand Up @@ -100,62 +116,102 @@ public RoCrate readCrate(String location) {
return crate;
}

private File checkFolderHasFile(String id, File file) {
protected File checkFolderHasFile(String id, File file) {
Path path = file.toPath().resolve(URLDecoder.decode(id, StandardCharsets.UTF_8));
if (path.toFile().exists()) {
return path.toFile();
}
return null;
}

// gets the entities that every crate should have
// we will need the root dataset to distinguish between data entities and contextual entities
private ArrayNode setRootEntities(RoCrate crate, ArrayNode graph) {

// for now, we make an empty ArrayNode and put all the entities
// that still need to be processed there
/**
* Sets the entities that every crate should have.
*
* Extracts the root data entity and the Metadata File Descriptor (JSON
* descriptor) from the graph and inserts them into the crate object.
* We will need the root dataset to distinguish between data entities and
* contextual entities.
*
* @param crate the crate, which will have the entities set, if available in the
* graph.
* @param graph the graph of the Metadata JSON file, where the entities are
* extracted from.
* @return the given graph, but without the root data entity and the Metadata
* File Descriptor (JSON descriptor).
*/
protected ArrayNode setRootEntities(RoCrate crate, ArrayNode graph) {
var graphCopy = graph.deepCopy();
// use the algorithm described here: https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
for (int i = 0; i < graph.size(); i++) {
JsonNode node = graph.get(i);
JsonNode type = node.get("conformsTo");
if (type != null) {
String uri = type.get("@id").asText();
if (uri.startsWith("https://w3id.org/ro/crate/")) {
crate.setJsonDescriptor(
new ContextualEntity.ContextualEntityBuilder().setAll(node.deepCopy()).build());
graphCopy.remove(i);
String id = node.get("about").get("@id").asText();
for (int j = 0; j < graphCopy.size(); j++) {
ObjectNode secondIteration = graphCopy.get(j).deepCopy();
if (secondIteration.get("@id").asText().equals(id)) {
// root data entity
JsonNode hasPartNode = secondIteration.get("hasPart");
Set<String> hasPartSet = new HashSet<>();
if (hasPartNode != null) {
if (hasPartNode.isArray()) {
for (var e : hasPartNode) {
hasPartSet.add(e.get("@id").asText());
}
} else if (hasPartNode.isObject()) {
hasPartSet.add(hasPartNode.get("@id").asText());
}
}
secondIteration.remove("hasPart");
crate.setRootDataEntity(
new RootDataEntity.RootDataEntityBuilder()
.setAll(secondIteration.deepCopy())
.setHasPart(hasPartSet)
.build()
);
graphCopy.remove(j);
break;
}
}
}
// use the algorithm described here:
// https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
Optional<JsonNode> maybeDescriptor = StreamSupport.stream(graph.spliterator(), false)
// "2. if the conformsTo property is a URI that starts with
// https://w3id.org/ro/crate/"
.filter(node -> node.path(PROP_CONFORMS_TO).path(PROP_ID).asText().startsWith(SPECIFICATION_PREFIX))
Pfeil marked this conversation as resolved.
Show resolved Hide resolved
// "3. from this entity’s about object keep the @id URI as variable root"
.filter(node -> node.path(PROP_ABOUT).path(PROP_ID).isTextual())
// There should be only one descriptor. If multiple exist, we take the first
// one.
.findFirst();

maybeDescriptor.ifPresent(descriptor -> {
removeJsonNodeFromArrayNode(graphCopy, descriptor);
setCrateDescriptor(crate, descriptor);

Optional<ObjectNode> maybeRoot = extractRoot(graphCopy, descriptor);

maybeRoot.ifPresent(root -> {
Set<String> hasPartIds = extractHasPartIds(root);

crate.setRootDataEntity(
new RootDataEntity.RootDataEntityBuilder()
.setAll(root.deepCopy())
.setHasPart(hasPartIds)
.build());

removeJsonNodeFromArrayNode(graphCopy, root);
});
});

return graphCopy;
}

/**
 * Finds the root data entity the given descriptor points to.
 *
 * The root is the first object in the graph whose {@code @id} equals the
 * {@code @id} of the descriptor's {@code about} object.
 *
 * @param graph      the graph to search for the root data entity.
 * @param descriptor the Metadata File Descriptor. It must contain an
 *                   {@code about} object with an {@code @id}; otherwise the
 *                   lookup fails with a {@link NullPointerException}.
 * @return a deep copy of the first matching object node, or an empty optional
 *         if no object in the graph has the expected id.
 */
private Optional<ObjectNode> extractRoot(ArrayNode graph, JsonNode descriptor) {
    String rootId = descriptor.get(PROP_ABOUT).get(PROP_ID).asText();
    return StreamSupport.stream(graph.spliterator(), false)
            // only object nodes can be entities
            .filter(JsonNode::isObject)
            // "5. if the entity has an @id URI that matches root return it"
            .filter(node -> node.path(PROP_ID).asText().equals(rootId))
            .findFirst()
            // copy only the match instead of every node visited before it
            .map(JsonNode::<ObjectNode>deepCopy);
}

/**
 * Collects the ids referenced by the {@code hasPart} property of the root.
 *
 * Supports both forms of the property: an array of references and a single
 * reference object. In the array form, entries without a non-blank
 * {@code @id} are ignored. In the single-object form, only the object's own
 * textual {@code @id} is used; nested values are not inspected.
 *
 * @param root the root data entity node.
 * @return a mutable set of the referenced ids; empty if there is no usable
 *         {@code hasPart} property.
 */
private Set<String> extractHasPartIds(ObjectNode root) {
    JsonNode hasPartNode = root.path(PROP_HAS_PART);
    // Explicit HashSet: the set is filled incrementally, and
    // Collectors.toSet() gives no mutability guarantee.
    Set<String> hasPartIds = new HashSet<>();
    if (hasPartNode.isArray()) {
        for (JsonNode hasPart : hasPartNode) {
            String id = hasPart.path(PROP_ID).asText();
            if (!id.isBlank()) {
                hasPartIds.add(id);
            }
        }
    } else if (hasPartNode.path(PROP_ID).isTextual()) {
        // single reference object, e.g. {"@id": "file.txt"}
        hasPartIds.add(hasPartNode.path(PROP_ID).asText());
    }
    return hasPartIds;
}

/**
 * Removes the first element of the array which equals the given node.
 *
 * Does nothing if the array contains no equal element.
 *
 * @param array the array to remove the element from (modified in place).
 * @param node  the node to look for, compared using {@code equals}.
 */
private void removeJsonNodeFromArrayNode(ArrayNode array, JsonNode node) {
    for (int i = 0; i < array.size(); i++) {
        if (array.get(i).equals(node)) {
            array.remove(i);
            // only the first occurrence is removed
            return;
        }
    }
}

/**
 * Stores the given descriptor node as the JSON descriptor of the crate.
 *
 * The node is deep-copied and wrapped into a contextual entity first.
 *
 * @param crate      the crate which receives the descriptor.
 * @param descriptor the descriptor node taken from the graph.
 */
private void setCrateDescriptor(RoCrate crate, JsonNode descriptor) {
    var builder = new ContextualEntity.ContextualEntityBuilder();
    ContextualEntity entity = builder.setAll(descriptor.deepCopy()).build();
    crate.setJsonDescriptor(entity);
}
}