From 0e8e70bd4cdee81ab97b50dd66b70bc197ad259e Mon Sep 17 00:00:00 2001 From: Jerven Bolleman Date: Mon, 16 Oct 2023 10:20:01 +0200 Subject: [PATCH] Prepare for maven central --- - | 91 --- LICENSE.md | 20 + README.md | 8 +- pom.xml | 398 +++++++--- .../sib/swissprot/handlegraph4jrdf/FALDO.java | 385 +++++---- .../swissprot/handlegraph4jrdf/GFA2RDF.java | 748 ++++++++++-------- .../sib/swissprot/handlegraph4jrdf/VG.java | 143 +++- .../handlegraph4jrdf/GFA2RDFTest.java | 31 +- 8 files changed, 1048 insertions(+), 776 deletions(-) delete mode 100644 - create mode 100644 LICENSE.md diff --git a/- b/- deleted file mode 100644 index 16cdff4..0000000 --- a/- +++ /dev/null @@ -1,91 +0,0 @@ -@base . -@prefix rdf: . -@prefix vg: . -@prefix node: . -@prefix faldo: . -node:1 a vg:Node; -rdf:value "caaataag" . -node:2 a vg:Node; -rdf:value "a" . -node:3 a vg:Node; -rdf:value "g" . -node:4 a vg:Node; -rdf:value "t" . -node:5 a vg:Node; -rdf:value "c" . -node:6 a vg:Node; -rdf:value "ttg" . -node:7 a vg:Node; -rdf:value "a" . -node:8 a vg:Node; -rdf:value "g" . -node:9 a vg:Node; -rdf:value "aaattttctggagttctat" . -node:10 a vg:Node; -rdf:value "a" . -node:11 a vg:Node; -rdf:value "t" . -node:12 a vg:Node; -rdf:value "atat" . -node:13 a vg:Node; -rdf:value "a" . -node:14 a vg:Node; -rdf:value "t" . -node:15 a vg:Node; -rdf:value "ccaactctctg" . - a vg:Path . -@prefix path0: . -path0:0 a vg:Step,faldo:Region; -vg:path ; -vg:rank 0; -vg:node node:1 . -path0:1 a vg:Step,faldo:Region; -vg:path ; -vg:rank 1; -vg:node node:3 . -path0:2 a vg:Step,faldo:Region; -vg:path ; -vg:rank 2; -vg:node node:5 . -path0:3 a vg:Step,faldo:Region; -vg:path ; -vg:rank 3; -vg:node node:6 . -path0:4 a vg:Step,faldo:Region; -vg:path ; -vg:rank 4; -vg:node node:8 . -path0:5 a vg:Step,faldo:Region; -vg:path ; -vg:rank 5; -vg:node node:9 . -path0:6 a vg:Step,faldo:Region; -vg:path ; -vg:rank 6; -vg:node node:11 . -path0:7 a vg:Step,faldo:Region; -vg:path ; -vg:rank 7; -vg:node node:12 . 
-path0:8 a vg:Step,faldo:Region; -vg:path ; -vg:rank 8; -vg:node node:14 . -path0:9 a vg:Step,faldo:Region; -vg:path ; -vg:rank 9; -vg:node node:15 . -node:1 vg:linksForwardToForward node:2,node:3 . -node:2 vg:linksForwardToForward node:4,node:5 . -node:3 vg:linksForwardToForward node:4,node:5 . -node:4 vg:linksForwardToForward node:6 . -node:5 vg:linksForwardToForward node:6 . -node:6 vg:linksForwardToForward node:7,node:8 . -node:7 vg:linksForwardToForward node:9 . -node:8 vg:linksForwardToForward node:9 . -node:9 vg:linksForwardToForward node:10,node:11 . -node:10 vg:linksForwardToForward node:12 . -node:11 vg:linksForwardToForward node:12 . -node:12 vg:linksForwardToForward node:13,node:14 . -node:13 vg:linksForwardToForward node:15 . -node:14 vg:linksForwardToForward node:15 . diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..4922fe8 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2020 SIB Swiss Institute of Bioinformatics + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index 1180751..1c0cf7a 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,12 @@ a RDF graph. To make a fat jar to run the program ``` -mvn assembly:assembly +mvn package ``` -Requires local install of handlegraph4j and handlegraph4jGFA +Then run with +``` +java -jar target/handlegraph4j-rdf-0.1-jar-with-dependencies.jar + +``` diff --git a/pom.xml b/pom.xml index 3956d64..9c6eaac 100644 --- a/pom.xml +++ b/pom.xml @@ -1,133 +1,271 @@ - - 4.0.0 - swiss.sib.swissprot - handlegraph4j-rdf - 1.0-SNAPSHOT - jar - - UTF-8 - 20.2.0 - 4.0.0-M2 - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.1 - - 17 - true - - - info.picocli - picocli-codegen - 4.5.2 - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 2.22.0 - - false - - - - org.apache.maven.plugins - maven-assembly-plugin - - - jar-with-dependencies - - - - sib.swiss.swissprot.handlegraph4jrdf.GFA2RDF - - - - - - - - - info.picocli - picocli - 4.5.2 - - - io.github.vgteam - handlegraph4j-gfa - 1.0-SNAPSHOT - - - - org.eclipse.rdf4j - rdf4j-client - ${eclipse.rdf4j.version} - pom - - - org.junit.jupiter - junit-jupiter-api - 5.6.0 - test - - - org.junit.jupiter - junit-jupiter-params - 5.6.0 - test - - - org.junit.jupiter - junit-jupiter-engine - 5.6.0 - test - - - org.eclipse.collections - eclipse-collections-api - 10.4.0 - - - org.eclipse.collections - eclipse-collections - 10.4.0 - - - org.graalvm.sdk - graal-sdk - ${graalvm.version} - provided - - - - - - github - GitHub Jerven Bolleman Apache Maven Packages - https://maven.pkg.github.com/JervenBolleman/handlegraph4jRDF - - + + 4.0.0 + io.github.jervenbolleman + handlegraph4j-rdf + 0.1 + jar + + + The MIT License (MIT) + + 
https://github.com/JervenBolleman/handlegraph4jRDF/blob/main/LICENSE.md + repo + + + + + jerven + Jerven Bolleman + jerven.bolleman@sib.swiss + SIB Swiss Institute of Bioinformatics + https://www.sib.swiss + + developer + + + + + UTF-8 + 4.3.7 + - - - central - https://repo1.maven.org/maven2 - true - true - - - github - GitHub Jerven Bolleman handlegraph4j Apache Maven Packages - https://maven.pkg.github.com/jervenbolleman/handlegraph4jGFA - true - true - - + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.11.0 + + 17 + true + + + info.picocli + picocli-codegen + 4.7.5 + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.1 + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + package + + shade + + + true + jar-with-dependencies + + + + + swiss.sib.swissprot.handlegraph4jrdf.GFA2RDF + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.1.2 + + false + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.6.0 + + + jar-with-dependencies + + + + + sib.swiss.swissprot.handlegraph4jrdf.GFA2RDF + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.13 + true + + ossrh + https://oss.sonatype.org/ + false + + + + org.apache.maven.plugins + maven-source-plugin + 3.3.0 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.6.0 + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 3.1.0 + + + sign-artifacts + verify + + sign + + + + + ${gpg.keyname} + ${gpg.keyname} + + + + + + + info.picocli + picocli + 4.7.5 + + + io.github.jervenbolleman + handlegraph4j-gfa + 1.2 + + + + org.eclipse.rdf4j + rdf4j-rio-binary + ${eclipse.rdf4j.version} + + + org.eclipse.rdf4j + rdf4j-rio-datatypes + ${eclipse.rdf4j.version} + + + org.eclipse.rdf4j + rdf4j-rio-languages + ${eclipse.rdf4j.version} + + + org.eclipse.rdf4j + rdf4j-rio-n3 + ${eclipse.rdf4j.version} + + + org.eclipse.rdf4j + rdf4j-rio-nquads + ${eclipse.rdf4j.version} + + + 
org.eclipse.rdf4j + rdf4j-rio-ntriples + ${eclipse.rdf4j.version} + + + org.eclipse.rdf4j + rdf4j-rio-rdfjson + ${eclipse.rdf4j.version} + + + org.eclipse.rdf4j + rdf4j-rio-rdfxml + ${eclipse.rdf4j.version} + + + org.eclipse.rdf4j + rdf4j-rio-trig + ${eclipse.rdf4j.version} + + + org.eclipse.rdf4j + rdf4j-rio-trix + ${eclipse.rdf4j.version} + + + org.junit.jupiter + junit-jupiter-api + 5.10.0 + test + + + org.junit.jupiter + junit-jupiter-params + 5.10.0 + test + + + org.junit.jupiter + junit-jupiter-engine + 5.10.0 + test + + + org.eclipse.collections + eclipse-collections-api + 11.1.0 + + + org.eclipse.collections + eclipse-collections + 11.1.0 + + + handlegraph4jRDF + https://github.com/jervenbolleman/handlegraph4jRDF + A set of java classes to describe a VG variation graph in RDF + plus GFA1 to RDF convertor. + + https://github.com/jervenbolleman/handlegraph4jRDF + Jerven Bolleman + diff --git a/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/FALDO.java b/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/FALDO.java index 84a3f59..1933399 100644 --- a/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/FALDO.java +++ b/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/FALDO.java @@ -1,7 +1,25 @@ /* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. + * The MIT License + * + * Copyright 2020 Jerven Bolleman . 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. */ package swiss.sib.swissprot.handlegraph4jrdf; @@ -9,175 +27,202 @@ import org.eclipse.rdf4j.model.impl.SimpleValueFactory; /** + * FALDO is a set of properties and classes to describe positions along a linear + * sequence. * - * @author Jerven Bolleman + * @see The FALDO Description + * in OWL + * @author Jerven Bolleman */ public class FALDO { - public static final String NAMESPACE = "http://biohackathon.org/resource/faldo#"; - public static final String PREFIX = "faldo"; - private static final SimpleValueFactory VF = SimpleValueFactory.getInstance(); - - /** - * Used to describe a location that consists of a number of Regions but - * where the order is not known. e.g. the oddly named order() keyword in a - * INSDC file. 
* - */ - public static final IRI BagOfRegions = VF.createIRI(NAMESPACE, "BagOfRegions"); - - /** - * The 'both strands position' indicates a region that is best described as - * being on 'both' strands of a double-stranded sequence, rather than one or - * the other. * - */ - public static final IRI BothStrandsPosition = VF.createIRI(NAMESPACE, "BothStrandsPosition"); - - /** - * Sometimes a location of a feature is defined by a collection of regions - * e.g. join() and order() in INSDC records. One should always try to model - * the semantics more accurately than this, these are fallback options to - * encode legacy data. * - */ - public static final IRI CollectionOfRegions = VF.createIRI(NAMESPACE, "CollectionOfRegions"); - - /** - * A position that is exactly known. * - */ - public static final IRI ExactPosition = VF.createIRI(NAMESPACE, "ExactPosition"); - public static final IRI N = VF.createIRI(NAMESPACE, "N-TerminalPosition"); - - public static final IRI C = VF.createIRI(NAMESPACE, "C-TerminalPosition"); - - /** - * The position is on the forward (positive, 5' to 3') strand. Shown as a - * '+' in GFF3 and GTF. * - */ - public static final IRI ForwardStrandPosition = VF.createIRI(NAMESPACE, "ForwardStrandPosition"); - - /** - * A position that lacks exact data. * - */ - public static final IRI FuzzyPosition = VF.createIRI(NAMESPACE, "FuzzyPosition"); - - /** - * This indicates that a feature is between two other positions that are - * both known exactly and that are next to each other. An example is a - * restriction enzyme cutting site. The cut is after one position and before - * the other position (hence, in between). * - */ - public static final IRI InBetweenPosition = VF.createIRI(NAMESPACE, "InBetweenPosition"); - - /** - * Use when you have an idea of the range in which you can find the - * position, but you cannot be sure about the exact position. 
* - */ - public static final IRI InRangePosition = VF.createIRI(NAMESPACE, "InRangePosition"); - - /** - * As an ordered list of regions (but the list might not be - * complete)."^^xsd:string , "Should be used when the location of a region - * is defined by an ordered list of Regions. However, try to avoid using - * these types in favor of using more explicit semantics about why the order - * is important. * - */ - public static final IRI ListOfRegions = VF.createIRI(NAMESPACE, "ListOfRegions"); - - /** - * The position is known to be one of the more detailed positions listed by - * the location predicate. * - */ - public static final IRI OneOfPosition = VF.createIRI(NAMESPACE, "OneOfPosition"); - - /** - * Superclass for the general concept of a position on a sequence. The - * sequence is designated with the reference predicate. * - */ - public static final IRI Position = VF.createIRI(NAMESPACE, "Position"); - - /** - * A region describes a length of sequence with a start position and end - * position that represents a feature on a sequence, e.g. a gene. * - */ - public static final IRI Region = VF.createIRI(NAMESPACE, "Region"); - - /** - * The position is on the reverse (complement, 3' to 5') strand of the - * sequence. Shown as '-' in GTF and GFF3. * - */ - public static final IRI ReverseStrandPosition = VF.createIRI(NAMESPACE, "ReverseStrandPosition"); - - /** - * Part of the coordinate system denoting on which strand the feature can be - * found. If you do not yet know which stand the feature is on, you should - * tag the position with just this class. If you know more you should use - * one of the subclasses. This means a region described with a '.' in GFF3. - * A GFF3 unstranded position does not have this type in FALDO -- those are - * just a 'position'. * - */ - public static final IRI StrandedPosition = VF.createIRI(NAMESPACE, "StrandedPosition"); - - /** - * This predicate is used when you want to describe a non-inclusive range. 
- * Only used in the InBetweenPosition to say it is after a nucleotide, but - * before the next one. * - */ - public static final IRI after = VF.createIRI(NAMESPACE, "after"); - - /** - * This predicate is used to indicate that the feature is found before the - * exact position. Use to indicate, for example, a cleavage site. The - * cleavage happens between two amino acids before one and after the other. - * * - */ - public static final IRI before = VF.createIRI(NAMESPACE, "before"); - - /** - * The inclusive beginning of a position. Also known as start. * - */ - public static final IRI begin = VF.createIRI(NAMESPACE, "begin"); - - /** - * This is the inverse of the begin:property. It is included to make it - * easier to write a number of OWL axioms. You should rarely use this in - * your raw data. * - */ - public static final IRI beginOf = VF.createIRI(NAMESPACE, "beginOf"); - - /** - * The inclusive end of the position. * - */ - public static final IRI end = VF.createIRI(NAMESPACE, "end"); - - /** - * This is the inverse of the begin:end. It is included to make it easier to - * write a number of OWL axioms. You should rarely use this in your raw - * data. * - */ - public static final IRI endOf = VF.createIRI(NAMESPACE, "endOf"); - /** - * This is the link between the concept whose location you are annotating - * and its range or position. For example, when annotating the region that - * describes an exon, the exon would be the subject and the region would be - * the object of the triple or: 'active site' 'location' [is] 'position 3'. - * * - */ - public static final IRI location = VF.createIRI(NAMESPACE, "location"); - - /** - * Denoted in 1-based closed coordinates, i.e. the position on the first - * amino acid or nucleotide of a sequence has the value 1. 
For nucleotide - * sequences we count from the 5'end of the sequence, while for Aminoacid - * sequences we start counting from the N-Terminus."^^xsd:string , "The - * position value is the offset along the reference where this position is - * found. Thus the only the position value in combination with the reference - * determines where a position is. * - */ - public static final IRI position = VF.createIRI(NAMESPACE, "position"); - - /** - * One of the possible positions listed for a OneOfPosition element. * - */ - public static final IRI possiblePosition = VF.createIRI(NAMESPACE, "possiblePosition"); - - public static final IRI reference = VF.createIRI(NAMESPACE, "reference"); + private FALDO() { + + } + + /** + * The FALDO namespace + */ + public static final String NAMESPACE = "http://biohackathon.org/resource/faldo#"; + /** + * Prefered prefix for FALDO "faldo" + */ + public static final String PREFIX = "faldo"; + private static final SimpleValueFactory VF = SimpleValueFactory.getInstance(); + + /** + * Used to describe a location that consists of a number of Regions but where + * the order is not known. e.g. the oddly named order() keyword in a INSDC file. + * * + */ + public static final IRI BagOfRegions = VF.createIRI(NAMESPACE, "BagOfRegions"); + + /** + * The 'both strands position' indicates a region that is best described as + * being on 'both' strands of a double-stranded sequence, rather than one or the + * other. * + */ + public static final IRI BothStrandsPosition = VF.createIRI(NAMESPACE, "BothStrandsPosition"); + + /** + * Sometimes a location of a feature is defined by a collection of regions e.g. + * join() and order() in INSDC records. One should always try to model the + * semantics more accurately than this, these are fallback options to encode + * legacy data. * + */ + public static final IRI CollectionOfRegions = VF.createIRI(NAMESPACE, "CollectionOfRegions"); + + /** + * A position that is exactly known. 
* + */ + public static final IRI ExactPosition = VF.createIRI(NAMESPACE, "ExactPosition"); + + /** + * The position of the starting amino-acid a protein or polypeptide terminated + * by an amino acid with a free amine group (-NH2). The convention for writing + * peptide sequences is to put the N-terminus on the left and write the sequence + * from N- to C-terminus. Instances of this class are often used when the + * reference sequence is not complete + */ + public static final IRI N = VF.createIRI(NAMESPACE, "N-TerminalPosition"); + + /** + * The C-terminus is the end of an amino acid chain (protein or polypeptide), + * terminated by a free carboxyl group (-COOH). + */ + public static final IRI C = VF.createIRI(NAMESPACE, "C-TerminalPosition"); + + /** + * The position is on the forward (positive, 5' to 3') strand. Shown as a '+' in + * GFF3 and GTF. * + */ + public static final IRI ForwardStrandPosition = VF.createIRI(NAMESPACE, "ForwardStrandPosition"); + + /** + * A position that lacks exact data. * + */ + public static final IRI FuzzyPosition = VF.createIRI(NAMESPACE, "FuzzyPosition"); + + /** + * This indicates that a feature is between two other positions that are both + * known exactly and that are next to each other. An example is a restriction + * enzyme cutting site. The cut is after one position and before the other + * position (hence, in between). * + */ + public static final IRI InBetweenPosition = VF.createIRI(NAMESPACE, "InBetweenPosition"); + + /** + * Use when you have an idea of the range in which you can find the position, + * but you cannot be sure about the exact position. * + */ + public static final IRI InRangePosition = VF.createIRI(NAMESPACE, "InRangePosition"); + + /** + * As an ordered list of regions (but the list might not be + * complete)."^^xsd:string , "Should be used when the location of a region is + * defined by an ordered list of Regions. 
However, try to avoid using these + * types in favor of using more explicit semantics about why the order is + * important. * + */ + public static final IRI ListOfRegions = VF.createIRI(NAMESPACE, "ListOfRegions"); + + /** + * The position is known to be one of the more detailed positions listed by the + * location predicate. * + */ + public static final IRI OneOfPosition = VF.createIRI(NAMESPACE, "OneOfPosition"); + + /** + * Superclass for the general concept of a position on a sequence. The sequence + * is designated with the reference predicate. * + */ + public static final IRI Position = VF.createIRI(NAMESPACE, "Position"); + + /** + * A region describes a length of sequence with a start position and end + * position that represents a feature on a sequence, e.g. a gene. * + */ + public static final IRI Region = VF.createIRI(NAMESPACE, "Region"); + + /** + * The position is on the reverse (complement, 3' to 5') strand of the sequence. + * Shown as '-' in GTF and GFF3. * + */ + public static final IRI ReverseStrandPosition = VF.createIRI(NAMESPACE, "ReverseStrandPosition"); + + /** + * Part of the coordinate system denoting on which strand the feature can be + * found. If you do not yet know which stand the feature is on, you should tag + * the position with just this class. If you know more you should use one of the + * subclasses. This means a region described with a '.' in GFF3. A GFF3 + * unstranded position does not have this type in FALDO -- those are just a + * 'position'. * + */ + public static final IRI StrandedPosition = VF.createIRI(NAMESPACE, "StrandedPosition"); + + /** + * This predicate is used when you want to describe a non-inclusive range. Only + * used in the InBetweenPosition to say it is after a nucleotide, but before the + * next one. * + */ + public static final IRI after = VF.createIRI(NAMESPACE, "after"); + + /** + * This predicate is used to indicate that the feature is found before the exact + * position. 
Use to indicate, for example, a cleavage site. The cleavage happens + * between two amino acids before one and after the other. * + */ + public static final IRI before = VF.createIRI(NAMESPACE, "before"); + + /** + * The inclusive beginning of a position. Also known as start. * + */ + public static final IRI begin = VF.createIRI(NAMESPACE, "begin"); + + /** + * This is the inverse of the begin:property. It is included to make it easier + * to write a number of OWL axioms. You should rarely use this in your raw data. + * * + */ + public static final IRI beginOf = VF.createIRI(NAMESPACE, "beginOf"); + + /** + * The inclusive end of the position. * + */ + public static final IRI end = VF.createIRI(NAMESPACE, "end"); + + /** + * This is the inverse of the begin:end. It is included to make it easier to + * write a number of OWL axioms. You should rarely use this in your raw data. * + */ + public static final IRI endOf = VF.createIRI(NAMESPACE, "endOf"); + /** + * This is the link between the concept whose location you are annotating and + * its range or position. For example, when annotating the region that describes + * an exon, the exon would be the subject and the region would be the object of + * the triple or: 'active site' 'location' [is] 'position 3'. * + */ + public static final IRI location = VF.createIRI(NAMESPACE, "location"); + + /** + * Denoted in 1-based closed coordinates, i.e. the position on the first amino + * acid or nucleotide of a sequence has the value 1. For nucleotide sequences we + * count from the 5'end of the sequence, while for Aminoacid sequences we start + * counting from the N-Terminus."^^xsd:string , "The position value is the + * offset along the reference where this position is found. Thus the only the + * position value in combination with the reference determines where a position + * is. 
* + */ + public static final IRI position = VF.createIRI(NAMESPACE, "position"); + + /** + * One of the possible positions listed for a OneOfPosition element. * + */ + public static final IRI possiblePosition = VF.createIRI(NAMESPACE, "possiblePosition"); + + /** + * The reference is the resource that the position value is anchored to. For + * example, a contig or chromosome in a genome assembly. + */ + public static final IRI reference = VF.createIRI(NAMESPACE, "reference"); } diff --git a/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/GFA2RDF.java b/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/GFA2RDF.java index ba2e256..8319dbd 100644 --- a/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/GFA2RDF.java +++ b/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/GFA2RDF.java @@ -1,28 +1,44 @@ /* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. + * The MIT License + * + * Copyright 2020 Jerven Bolleman . + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. */ package swiss.sib.swissprot.handlegraph4jrdf; -import io.github.vgteam.handlegraph4j.gfa1.GFA1Reader; -import io.github.vgteam.handlegraph4j.gfa1.line.Line; -import io.github.vgteam.handlegraph4j.gfa1.line.LinkLine; -import io.github.vgteam.handlegraph4j.gfa1.line.PathLine; -import io.github.vgteam.handlegraph4j.gfa1.line.PathLine.Step; -import io.github.vgteam.handlegraph4j.gfa1.line.SegmentLine; +import static java.nio.charset.StandardCharsets.US_ASCII; + import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; -import static java.nio.charset.StandardCharsets.US_ASCII; import java.nio.file.Files; import java.util.Iterator; +import java.util.List; import java.util.concurrent.Callable; import java.util.stream.Stream; -import org.eclipse.collections.impl.map.mutable.primitive.IntIntHashMap; + +import org.eclipse.collections.impl.map.mutable.primitive.LongIntHashMap; import org.eclipse.collections.impl.map.mutable.primitive.ObjectIntHashMap; import org.eclipse.rdf4j.common.net.ParsedIRI; import org.eclipse.rdf4j.model.IRI; @@ -33,350 +49,392 @@ import org.eclipse.rdf4j.model.impl.SimpleIRI; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.rio.RDFFormat; import org.eclipse.rdf4j.rio.RDFHandlerException; import org.eclipse.rdf4j.rio.RDFWriter; +import org.eclipse.rdf4j.rio.Rio; +import org.eclipse.rdf4j.rio.UnsupportedRDFormatException; import org.eclipse.rdf4j.rio.helpers.BasicWriterSettings; import org.eclipse.rdf4j.rio.turtle.TurtleWriter; + 
+import io.github.jervenbolleman.handlegraph4j.gfa1.GFA1Reader; +import io.github.jervenbolleman.handlegraph4j.gfa1.line.Line; +import io.github.jervenbolleman.handlegraph4j.gfa1.line.LinkLine; +import io.github.jervenbolleman.handlegraph4j.gfa1.line.PathLine; +import io.github.jervenbolleman.handlegraph4j.gfa1.line.PathLine.Step; +import io.github.jervenbolleman.handlegraph4j.gfa1.line.SegmentLine; import picocli.CommandLine; +import picocli.CommandLine.Command; import picocli.CommandLine.Option; import picocli.CommandLine.Parameters; -import picocli.CommandLine.Command; /** - * - * @author Jerven Bolleman + * Convert GFA1 to RDF. + * + * @author Jerven Bolleman */ -@Command(name = "gfa1toRdf", mixinStandardHelpOptions = true, version = "gfa1tot2rdf 0.0.1", - description = "Prints the equivalent RDF for a GFA1 file") +@Command(name = "gfa1toRdf", mixinStandardHelpOptions = true, version = "gfa1tot2rdf 0.1", description = "Prints the equivalent RDF for a GFA1 file") public class GFA2RDF implements Callable { - private static final ValueFactory VF = SimpleValueFactory.getInstance(); - private final IntIntHashMap nodeLengthMapByIntId = new IntIntHashMap(); - private final ObjectIntHashMap nodeLengthMapByByteArrayId = new ObjectIntHashMap<>(); - - @Parameters(index = "0", description = "The GFA file to translate to RDF") - private File inputFile; - - @Parameters(index = "1", description = "The output file to write RDF too") - private File outputFile; - - @Option(names = {"-b", "--base"}, description = "Base IRI of this graph") - private String base = "http://example.org/vg/"; - - @Option(names = {"-s", "--short"}, description = "try to generate shorter text, and leave out inferable triples") - boolean preCompress; - - @Option(names = {"-e", "--extra"}, description = "try to all triples possible") - boolean extra; - - @Override - public Integer call() throws Exception { // your business logic goes here... 
- ParsedIRI baseIRI = new ParsedIRI(base); - try ( OutputStream out = new BufferedOutputStream(new FileOutputStream(outputFile))) { - try ( Stream s = Files.lines(inputFile.toPath(), StandardCharsets.US_ASCII)) { - writeConvertedToOutputStream(out, baseIRI, s); - } - } - return 0; - } - - void writeConvertedToOutputStream(final OutputStream out, ParsedIRI baseIRI, Stream s) throws RDFHandlerException { - TurtleWriter tw = new PrefixedURITurtleWriter(out, baseIRI); - tw.set(BasicWriterSettings.PRETTY_PRINT, false); - tw.startRDF(); - String nodePrefix; - if (preCompress) { - tw.handleNamespace("r", RDF.NAMESPACE); - tw.handleNamespace("", VG.NAMESPACE); - nodePrefix = "n"; - tw.handleNamespace("S", VG.Step.stringValue()); - tw.handleNamespace("N", VG.Node.stringValue()); - tw.handleNamespace("rv", RDF.VALUE.stringValue()); - tw.handleNamespace("ff", VG.linksForwardToForward.stringValue()); - tw.handleNamespace("fr", VG.linksForwardToReverse.stringValue()); - tw.handleNamespace("rf", VG.linksReverseToForward.stringValue()); - tw.handleNamespace("rr", VG.linksReverseToReverse.stringValue()); - tw.handleNamespace("l", VG.links.stringValue()); - tw.handleNamespace("sp", VG.path.stringValue()); - tw.handleNamespace("sr", VG.rank.stringValue()); - tw.handleNamespace("sn", VG.node.stringValue()); - tw.handleNamespace("f", FALDO.NAMESPACE); - tw.handleNamespace("fep", FALDO.ExactPosition.stringValue()); - tw.handleNamespace("p", FALDO.position.stringValue()); - } else { - tw.handleNamespace(RDF.PREFIX, RDF.NAMESPACE); - tw.handleNamespace(VG.PREFIX, VG.NAMESPACE); - nodePrefix = "node"; - tw.handleNamespace(FALDO.PREFIX, FALDO.NAMESPACE); - } - tw.handleNamespace(nodePrefix, base + "node/"); - Iterator si = s.iterator(); - convert(si, tw, nodePrefix); - - tw.endRDF(); - } - - public void convert(Iterator si, TurtleWriter tw, String nodePrefix) { - int pathCounter = 0; - GFA1Reader gfA1Reader = new GFA1Reader(si); - while (gfA1Reader.hasNext()) { - Line line = 
gfA1Reader.next(); - pathCounter = convertLineToRdf(line, tw, pathCounter, nodePrefix); - } - } - - public static void main(String[] args) { - int exitCode = new CommandLine(new GFA2RDF()).execute(args); - System.exit(exitCode); - - } - - private int convertLineToRdf(Line line, RDFWriter tw, int pathCounter, String nodePrefix) { - switch (line.getCode()) { - case PathLine.CODE: - return convertPathLineToRdf((PathLine) line, tw, pathCounter, nodePrefix); - case SegmentLine.CODE: - convertSegmentLineToRdf((SegmentLine) line, tw, nodePrefix); - return pathCounter; - case LinkLine.CODE: - convertLinkLineToRdf((LinkLine) line, tw, nodePrefix); - return pathCounter; - default: - return pathCounter; - } - } - - private int convertPathLineToRdf(PathLine pathLine, RDFWriter writer, int pathCounter, String nodePrefix) { - String nameAsString = pathLine.getNameAsString(); - IRI pathIRI; - String pathPrefix; - String pathStepPrefix; - String pathPositionPrefix; - if (preCompress) { - pathPrefix = "pn"; - pathStepPrefix = "ps"; - pathPositionPrefix = "pp"; - } else { - pathPrefix = "path"; - pathStepPrefix = "pathstep"; - pathPositionPrefix = "pathposition" ; - } - if (nameAsString.startsWith("http://") || nameAsString.startsWith("ftp://") || nameAsString.startsWith("https://")) { - pathIRI = new PrefixedIRI(pathPrefix, nameAsString, ""); - } else { - pathIRI = new PrefixedIRI(pathPrefix, base + "path/", nameAsString); - } - - String pathStepBase = pathIRI.stringValue() + "/step/"; - String pathPositionBase = pathIRI.stringValue() + "/position/"; - - writer.handleNamespace(pathPrefix, pathIRI.stringValue()); - writer.handleNamespace(pathStepPrefix, pathStepBase); - if (extra) { - writer.handleNamespace(pathPositionPrefix, pathPositionBase); - } - writer.handleStatement(VF.createStatement(pathIRI, RDF.TYPE, VG.Path)); - Iterator steps = pathLine.steps(); - int begin = 1; // We start at position 1. 
- while (steps.hasNext()) { - Step step = steps.next(); - writeStep(step, pathStepBase, writer, pathIRI, begin, pathPositionBase, nodePrefix, pathPrefix, pathStepPrefix, pathPositionPrefix); - } - if (writer instanceof PrefixedURITurtleWriter) { - PrefixedURITurtleWriter tw = (PrefixedURITurtleWriter) writer; - tw.unsetNamespace(pathIRI.stringValue()); - tw.unsetNamespace(pathStepBase); - tw.unsetNamespace(pathPositionBase); - } - return pathCounter++; - } - - void writeStep(Step step, String pathStepBase, RDFWriter tw, IRI pathIRI, int begin, String pathPositionBase, String nodePrefix, String pathPrefix, String pathStepPrefix, String pathPositionPrefix) throws RDFHandlerException { - long rank = step.rank(); - IRI stepIRI = new PrefixedIRI(pathStepPrefix, pathStepBase, Long.toString(rank)); - if (!preCompress) { - tw.handleStatement(VF.createStatement(stepIRI, RDF.TYPE, VG.Step)); - tw.handleStatement(VF.createStatement(stepIRI, RDF.TYPE, FALDO.Region)); - } - tw.handleStatement(VF.createStatement(stepIRI, VG.path, pathIRI)); - if (rank < Integer.MAX_VALUE) { - tw.handleStatement(VF.createStatement(stepIRI, VG.rank, VF.createLiteral((int) rank))); - } else { - tw.handleStatement(VF.createStatement(stepIRI, VG.rank, VF.createLiteral(rank))); - } - tw.handleStatement(VF.createStatement(stepIRI, VG.node, createNodeId(nodePrefix, new String(step.nodeId(), US_ASCII)))); - if (extra) { - int end = writePositions(begin, step, pathPositionBase, tw, stepIRI, pathPositionPrefix); - begin = end; - } - } - - int writePositions(int begin, Step step, String pathPositionBase, RDFWriter tw, IRI stepIRI, String pathPositionPrefix) throws RDFHandlerException { - int end = begin + getNodeLengthOfStep(step); - IRI beginIri = new PrefixedIRI(pathPositionPrefix, pathPositionBase, String.valueOf(begin)); - IRI endIri = new PrefixedIRI(pathPositionPrefix, pathPositionBase, String.valueOf(end)); - tw.handleStatement(VF.createStatement(stepIRI, FALDO.begin, beginIri)); - 
tw.handleStatement(VF.createStatement(stepIRI, FALDO.end, endIri)); - if (!preCompress) { - tw.handleStatement(VF.createStatement(beginIri, RDF.TYPE, FALDO.Position)); - } - tw.handleStatement(VF.createStatement(beginIri, RDF.TYPE, FALDO.ExactPosition)); - tw.handleStatement(VF.createStatement(beginIri, FALDO.position, VF.createLiteral(begin))); - if (!preCompress) { - tw.handleStatement(VF.createStatement(endIri, RDF.TYPE, FALDO.Position)); - } - tw.handleStatement(VF.createStatement(endIri, RDF.TYPE, FALDO.ExactPosition)); - tw.handleStatement(VF.createStatement(endIri, FALDO.position, VF.createLiteral(end))); - return end; - } - - private void convertSegmentLineToRdf(SegmentLine segmentLine, RDFWriter tw, String nodePrefix) { - String name = segmentLine.getNameAsString(); - IRI nodeIRI = createNodeId(nodePrefix, name); - tw.handleStatement(VF.createStatement(nodeIRI, RDF.TYPE, VG.Node)); - if (extra) { - try { - int id = Integer.parseInt(name); - nodeLengthMapByIntId.put(id, segmentLine.getSequence().length()); - } catch (NumberFormatException e) { - nodeLengthMapByByteArrayId.put(name, segmentLine.getSequence().length()); - } - } - tw.handleStatement(VF.createStatement(nodeIRI, RDF.VALUE, VF.createLiteral(segmentLine.getSequence().asString()))); - } - - private void convertLinkLineToRdf(LinkLine linkLine, RDFWriter tw, String nodePrefix) { - IRI fromNodeIRI = createNodeId(nodePrefix, linkLine.getFromNameAsString()); - IRI toNodeIRI = createNodeId(nodePrefix, linkLine.getToNameAsString()); - if (linkLine.isReverseComplimentOfFrom()) { - if (linkLine.isReverseComplimentOfTo()) { - tw.handleStatement(VF.createStatement(fromNodeIRI, VG.linksReverseToReverse, toNodeIRI)); - } else { - tw.handleStatement(VF.createStatement(fromNodeIRI, VG.linksReverseToForward, toNodeIRI)); - } - } else { - if (linkLine.isReverseComplimentOfTo()) { - tw.handleStatement(VF.createStatement(fromNodeIRI, VG.linksForwardToReverse, toNodeIRI)); - } else { - 
tw.handleStatement(VF.createStatement(fromNodeIRI, VG.linksForwardToForward, toNodeIRI)); - } - } - } - - IRI createNodeId(String prefix, String nodeId) { - return new PrefixedIRI(prefix, base + "node/", nodeId); - } - - private int getNodeLengthOfStep(Step step) { - if (step.nodeHasLongId()) { - return nodeLengthMapByIntId.get((int) step.nodeLongId()); - } else { - return nodeLengthMapByByteArrayId.get(step.nodeId()); - } - - } - - private class PrefixedURITurtleWriter extends TurtleWriter { - - public PrefixedURITurtleWriter(OutputStream out, ParsedIRI piri) { - super(out, piri); - } - - @Override - protected void writeURI(IRI res) throws IOException { - - if (res instanceof PrefixedIRI) { - PrefixedIRI pi = (PrefixedIRI) res; - writer.write(pi.prefix + ':' + pi.localName); - } else { - String prefix = namespaceTable.get(res.stringValue()); - if (prefix != null) { - writer.write(prefix); - writer.write(":"); - return; - } - super.writeURI(res); - } - } - - @Override - protected void writeLiteral(Literal res) throws IOException { - if (res instanceof NumericLiteral) { - String normalized = XMLDatatypeUtil.normalize(res.getLabel(), res.getDatatype()); - switch (normalized) { - case XMLDatatypeUtil.POSITIVE_INFINITY: - case XMLDatatypeUtil.NEGATIVE_INFINITY: - case XMLDatatypeUtil.NaN: - super.writeLiteral(res); - break; - default: - writer.write(normalized); - return; - } - } - super.writeLiteral(res); - - } - - private void unsetNamespace(String iri) { - namespaceTable.remove(iri); - } - } - - private class PrefixedIRI extends SimpleIRI { - - private final String prefix; - private final String namespace; - private final String localName; - - public PrefixedIRI(String prefix, String namespace, String localName) { - this.prefix = prefix; - this.namespace = namespace; - this.localName = localName; - } - - @Override - public String toString() { - return stringValue(); - } - - @Override - public String stringValue() { - return namespace + localName; - } - - @Override - 
public String getNamespace() { - return namespace; - } - - @Override - public String getLocalName() { - return localName; - } - - // Implements IRI.equals(Object) - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o instanceof IRI) { - - String a = toString(); - String b = o.toString(); - - return a.equals(b); - } - - return false; - } - - // Implements IRI.hashCode() - @Override - public int hashCode() { - return stringValue().hashCode(); - } - } + GFA2RDF() { + + } + + private static final ValueFactory VF = SimpleValueFactory.getInstance(); + private final LongIntHashMap nodeLengthMapByLongId = new LongIntHashMap(); + private final ObjectIntHashMap nodeLengthMapByByteArrayId = new ObjectIntHashMap<>(); + + @Parameters(index = "0", description = "The GFA file to translate to RDF") + private File inputFile; + + @Parameters(index = "1", description = "The output file to write RDF too") + private File outputFile; + + @Option(names = { "-b", "--base" }, description = "Base IRI of this graph") + private String base = "http://example.org/vg/"; + + @Option(names = { "-s", "--short" }, description = "try to generate shorter text, and leave out inferable triples") + boolean preCompress; + + @Option(names = { "-e", "--extra" }, description = "try to all triples possible") + boolean extra; + + @Option(names = { "-f", "--rdf-format" }, description = "Which RDF serialization format to use (mimetype", defaultValue = "text/turtle") + String format = "text/turtle"; + + /** + * Run the actual conversion. + */ + @Override + public Integer call() throws Exception { // your business logic goes here... 
+ ParsedIRI baseIRI = new ParsedIRI(base); + try (OutputStream out = new BufferedOutputStream(new FileOutputStream(outputFile))) { + try (Stream s = Files.lines(inputFile.toPath(), StandardCharsets.US_ASCII)) { + writeConvertedToOutputStream(out, baseIRI, s); + } + } + return 0; + } + + void writeConvertedToOutputStream(final OutputStream out, ParsedIRI baseIRI, Stream s) + throws RDFHandlerException, UnsupportedRDFormatException, URISyntaxException { + RDFFormat rdfformat = RDFFormat.matchMIMEType(format, List.of(RDFFormat.TURTLE, RDFFormat.NTRIPLES, RDFFormat.RDFXML, RDFFormat.BINARY)).orElse(RDFFormat.TURTLE); + RDFWriter tw = Rio.createWriter(rdfformat, out, baseIRI.toString()); + tw.startRDF(); + String nodePrefix; + if (preCompress) { + tw.set(BasicWriterSettings.PRETTY_PRINT, false); + tw.handleNamespace("r", RDF.NAMESPACE); + tw.handleNamespace("", VG.NAMESPACE); + nodePrefix = "n"; + tw.handleNamespace("S", VG.Step.stringValue()); + tw.handleNamespace("N", VG.Node.stringValue()); + tw.handleNamespace("v", RDF.VALUE.stringValue()); + tw.handleNamespace("ff", VG.linksForwardToForward.stringValue()); + tw.handleNamespace("fr", VG.linksForwardToReverse.stringValue()); + tw.handleNamespace("rf", VG.linksReverseToForward.stringValue()); + tw.handleNamespace("rr", VG.linksReverseToReverse.stringValue()); + tw.handleNamespace("l", VG.links.stringValue()); + tw.handleNamespace("sp", VG.path.stringValue()); + tw.handleNamespace("sr", VG.rank.stringValue()); + tw.handleNamespace("sn", VG.node.stringValue()); + tw.handleNamespace("f", FALDO.NAMESPACE); + tw.handleNamespace("ep", FALDO.ExactPosition.stringValue()); + tw.handleNamespace("p", FALDO.position.stringValue()); + } else { + tw.set(BasicWriterSettings.PRETTY_PRINT, true); + tw.handleNamespace(RDF.PREFIX, RDF.NAMESPACE); + tw.handleNamespace(VG.PREFIX, VG.NAMESPACE); + nodePrefix = "node"; + tw.handleNamespace(FALDO.PREFIX, FALDO.NAMESPACE); + } + tw.handleNamespace(nodePrefix, base + "node/"); + Iterator si = 
s.iterator(); + convert(si, tw, nodePrefix); + + tw.endRDF(); + } + + /** + * Conversion of GFA1 strings to VG style turtle RDF, + * + * @param si a GFA1 file as an iterator of lines. + * @param tw the RDF writer in turtle format. + * @param nodePrefix a prefix to give to nodes (namespacing them) + */ + public void convert(Iterator si, RDFWriter tw, String nodePrefix) { + int pathCounter = 0; + GFA1Reader gfA1Reader = new GFA1Reader(si); + while (gfA1Reader.hasNext()) { + Line line = gfA1Reader.next(); + pathCounter = convertLineToRdf(line, tw, pathCounter, nodePrefix); + } + } + + /** + * Given the arguments convert the GFA1 to RDF. + * + * @param args the default main arguments + */ + public static void main(String[] args) { + int exitCode = new CommandLine(new GFA2RDF()).execute(args); + System.exit(exitCode); + } + + private int convertLineToRdf(Line line, RDFWriter tw, int pathCounter, String nodePrefix) { + switch (line.getCode()) { + case PathLine.CODE: + return convertPathLineToRdf((PathLine) line, tw, pathCounter, nodePrefix); + case SegmentLine.CODE: + convertSegmentLineToRdf((SegmentLine) line, tw, nodePrefix); + return pathCounter; + case LinkLine.CODE: + convertLinkLineToRdf((LinkLine) line, tw, nodePrefix); + return pathCounter; + default: + return pathCounter; + } + } + + private int convertPathLineToRdf(PathLine pathLine, RDFWriter writer, int pathCounter, String nodePrefix) { + String nameAsString = pathLine.getNameAsString(); + IRI pathIRI; + String pathPrefix; + String pathStepPrefix; + String pathPositionPrefix; + if (preCompress) { + pathPrefix = "pn"; + pathStepPrefix = "ps"; + pathPositionPrefix = "pp"; + } else { + pathPrefix = "path"; + pathStepPrefix = "pathstep"; + pathPositionPrefix = "pathposition"; + } + if (nameAsString.startsWith("http://") || nameAsString.startsWith("ftp://") + || nameAsString.startsWith("https://")) { + pathIRI = new PrefixedIRI(pathPrefix, nameAsString, ""); + } else { + pathIRI = new PrefixedIRI(pathPrefix, base 
+ "path/", nameAsString); + } + + String pathStepBase = pathIRI.stringValue() + "/step/"; + String pathPositionBase = pathIRI.stringValue() + "/position/"; + + writer.handleNamespace(pathPrefix, pathIRI.stringValue()); + writer.handleNamespace(pathStepPrefix, pathStepBase); + if (extra) { + writer.handleNamespace(pathPositionPrefix, pathPositionBase); + } + writer.handleStatement(VF.createStatement(pathIRI, RDF.TYPE, VG.Path)); + Iterator steps = pathLine.steps(); + int begin = 1; // We start at position 1. + while (steps.hasNext()) { + Step step = steps.next(); + writeStep(step, pathStepBase, writer, pathIRI, begin, pathPositionBase, nodePrefix, pathPrefix, + pathStepPrefix, pathPositionPrefix); + } + if (writer instanceof PrefixedURITurtleWriter) { + PrefixedURITurtleWriter tw = (PrefixedURITurtleWriter) writer; + tw.unsetNamespace(pathIRI.stringValue()); + tw.unsetNamespace(pathStepBase); + tw.unsetNamespace(pathPositionBase); + } + return pathCounter++; + } + + void writeStep(Step step, String pathStepBase, RDFWriter tw, IRI pathIRI, int begin, String pathPositionBase, + String nodePrefix, String pathPrefix, String pathStepPrefix, String pathPositionPrefix) + throws RDFHandlerException { + long rank = step.rank(); + IRI stepIRI = new PrefixedIRI(pathStepPrefix, pathStepBase, Long.toString(rank)); + if (!preCompress) { + tw.handleStatement(VF.createStatement(stepIRI, RDF.TYPE, VG.Step)); + tw.handleStatement(VF.createStatement(stepIRI, RDF.TYPE, FALDO.Region)); + } + tw.handleStatement(VF.createStatement(stepIRI, VG.path, pathIRI)); + if (rank < Integer.MAX_VALUE) { + tw.handleStatement(VF.createStatement(stepIRI, VG.rank, VF.createLiteral((int) rank))); + } else { + tw.handleStatement(VF.createStatement(stepIRI, VG.rank, VF.createLiteral(rank))); + } + tw.handleStatement( + VF.createStatement(stepIRI, VG.node, createNodeId(nodePrefix, new String(step.nodeId(), US_ASCII)))); + if (extra) { + int end = writePositions(begin, step, pathPositionBase, tw, stepIRI, 
pathPositionPrefix); + begin = end; + } + } + + int writePositions(int begin, Step step, String pathPositionBase, RDFWriter tw, IRI stepIRI, + String pathPositionPrefix) throws RDFHandlerException { + int end = begin + getNodeLengthOfStep(step); + IRI beginIri = new PrefixedIRI(pathPositionPrefix, pathPositionBase, String.valueOf(begin)); + IRI endIri = new PrefixedIRI(pathPositionPrefix, pathPositionBase, String.valueOf(end)); + tw.handleStatement(VF.createStatement(stepIRI, FALDO.begin, beginIri)); + tw.handleStatement(VF.createStatement(stepIRI, FALDO.end, endIri)); + if (!preCompress) { + tw.handleStatement(VF.createStatement(beginIri, RDF.TYPE, FALDO.Position)); + } + tw.handleStatement(VF.createStatement(beginIri, RDF.TYPE, FALDO.ExactPosition)); + tw.handleStatement(VF.createStatement(beginIri, FALDO.position, VF.createLiteral(begin))); + if (!preCompress) { + tw.handleStatement(VF.createStatement(endIri, RDF.TYPE, FALDO.Position)); + } + tw.handleStatement(VF.createStatement(endIri, RDF.TYPE, FALDO.ExactPosition)); + tw.handleStatement(VF.createStatement(endIri, FALDO.position, VF.createLiteral(end))); + return end; + } + + private void convertSegmentLineToRdf(SegmentLine segmentLine, RDFWriter tw, String nodePrefix) { + String name = segmentLine.getNameAsString(); + IRI nodeIRI = createNodeId(nodePrefix, name); + tw.handleStatement(VF.createStatement(nodeIRI, RDF.TYPE, VG.Node)); + if (extra) { + try { + int id = Integer.parseInt(name); + nodeLengthMapByLongId.put(id, segmentLine.getSequence().length()); + } catch (NumberFormatException e) { + nodeLengthMapByByteArrayId.put(name, segmentLine.getSequence().length()); + } + } + tw.handleStatement( + VF.createStatement(nodeIRI, RDF.VALUE, VF.createLiteral(segmentLine.getSequence().asString()))); + } + + private void convertLinkLineToRdf(LinkLine linkLine, RDFWriter tw, String nodePrefix) { + IRI fromNodeIRI = createNodeId(nodePrefix, linkLine.getFromNameAsString()); + IRI toNodeIRI = createNodeId(nodePrefix, 
linkLine.getToNameAsString()); + if (linkLine.isReverseComplimentOfFrom()) { + if (linkLine.isReverseComplimentOfTo()) { + tw.handleStatement(VF.createStatement(fromNodeIRI, VG.linksReverseToReverse, toNodeIRI)); + } else { + tw.handleStatement(VF.createStatement(fromNodeIRI, VG.linksReverseToForward, toNodeIRI)); + } + } else { + if (linkLine.isReverseComplimentOfTo()) { + tw.handleStatement(VF.createStatement(fromNodeIRI, VG.linksForwardToReverse, toNodeIRI)); + } else { + tw.handleStatement(VF.createStatement(fromNodeIRI, VG.linksForwardToForward, toNodeIRI)); + } + } + } + + IRI createNodeId(String prefix, String nodeId) { + return new PrefixedIRI(prefix, base + "node/", nodeId); + } + + private int getNodeLengthOfStep(Step step) { + if (step.nodeHasLongId()) { + return nodeLengthMapByLongId.get(step.nodeLongId()); + } else { + return nodeLengthMapByByteArrayId.get(step.nodeId()); + } + + } + + private class PrefixedURITurtleWriter extends TurtleWriter { + + public PrefixedURITurtleWriter(OutputStream out, ParsedIRI piri) { + super(out, piri); + } + + @Override + protected void writeURI(IRI res) throws IOException { + + if (res instanceof PrefixedIRI) { + PrefixedIRI pi = (PrefixedIRI) res; + writer.write(pi.prefix + ':' + pi.localName); + } else { + String prefix = namespaceTable.get(res.stringValue()); + if (prefix != null) { + writer.write(prefix); + writer.write(":"); + return; + } + super.writeURI(res); + } + } + + @Override + protected void writeLiteral(Literal res) throws IOException { + if (res instanceof NumericLiteral) { + String normalized = XMLDatatypeUtil.normalize(res.getLabel(), res.getDatatype()); + switch (normalized) { + case XMLDatatypeUtil.POSITIVE_INFINITY: + case XMLDatatypeUtil.NEGATIVE_INFINITY: + case XMLDatatypeUtil.NaN: + super.writeLiteral(res); + break; + default: + writer.write(normalized); + return; + } + } + super.writeLiteral(res); + + } + + private void unsetNamespace(String iri) { + namespaceTable.remove(iri); + } + } + + 
private class PrefixedIRI extends SimpleIRI { + + private static final long serialVersionUID = 1L; + private final String prefix; + private final String namespace; + private final String localName; + + public PrefixedIRI(String prefix, String namespace, String localName) { + this.prefix = prefix; + this.namespace = namespace; + this.localName = localName; + } + + @Override + public String toString() { + return stringValue(); + } + + @Override + public String stringValue() { + return namespace + localName; + } + + @Override + public String getNamespace() { + return namespace; + } + + @Override + public String getLocalName() { + return localName; + } + + // Implements IRI.equals(Object) + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o instanceof IRI) { + + String a = toString(); + String b = o.toString(); + + return a.equals(b); + } + + return false; + } + + // Implements IRI.hashCode() + @Override + public int hashCode() { + return stringValue().hashCode(); + } + } } diff --git a/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/VG.java b/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/VG.java index 50ddd2e..82e662c 100644 --- a/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/VG.java +++ b/src/main/java/swiss/sib/swissprot/handlegraph4jrdf/VG.java @@ -1,49 +1,128 @@ /* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. + * The MIT License + * + * Copyright 2020 Jerven Bolleman . 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. */ package swiss.sib.swissprot.handlegraph4jrdf; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Namespace; -import org.eclipse.rdf4j.model.ValueFactory; import org.eclipse.rdf4j.model.impl.SimpleNamespace; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; -import org.eclipse.rdf4j.model.vocabulary.RDF; /** - * - * @author Jerven Bolleman + * A minimal set of properties and classes that are needed to describe variation + * graphs. 
+ * + * @see The VG Description in OWL + * @author Jerven Bolleman */ public class VG { - /** - * http://biohackathon.org/resource/vg# - */ - public static final String NAMESPACE = "http://biohackathon.org/resource/vg#"; + private VG() { + + } - /** - * Recommended prefix for the VG namespace: "vg" - */ - public static final String PREFIX = "vg"; + /** + * http://biohackathon.org/resource/vg# + */ + public static final String NAMESPACE = "http://biohackathon.org/resource/vg#"; - /** - * An immutable {@link Namespace} constant that represents the VG namespace. - */ - public static final Namespace NS = new SimpleNamespace(PREFIX, NAMESPACE); + /** + * Recommended prefix for the VG namespace: "vg" + */ + public static final String PREFIX = "vg"; - public static final IRI Node = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "Node"); - public static final IRI Path = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "Path"); - public static final IRI Step = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "Step"); - public static final IRI rank = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "rank"); - public static final IRI position = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "position"); - public static final IRI path = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "path"); - public static final IRI linksForwardToForward = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "linksForwardToForward"); - public static final IRI linksForwardToReverse = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "linksForwardToReverse"); - public static final IRI linksReverseToForward = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "linksReverseToForward"); - public static final IRI linksReverseToReverse = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "linksReverseToReverse"); - public static final IRI links = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "links"); - public static final IRI reverseOfNode = 
SimpleValueFactory.getInstance().createIRI(NAMESPACE, "reverseOfNode"); - public static final IRI node = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "node"); + /** + * An immutable {@link Namespace} constant that represents the VG namespace. + */ + public static final Namespace NS = new SimpleNamespace(PREFIX, NAMESPACE); + /** + * A node in the variant graph, representing a sequence section. + */ + public static final IRI Node = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "Node"); + /** + * A Path is a collection of steps from path to path that represent an + * assembled sequence integrated into the variant graph. + */ + public static final IRI Path = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "Path"); + /** + * A step along a path in the variant graph. A series of steps along a path + * represent an assembled sequence that was originally inserted into the + * variant graph. A step points to a :Node or the reverse complement of a node + * and has a rank (step number). + */ + public static final IRI Step = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "Step"); + /** + * The rank records the step place along its path. + */ + public static final IRI rank = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "rank"); + /** + * This is the position on the reference path at which this step starts. + */ + public static final IRI position = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "position"); + /** + * This means that this step occurs on the path that is the object of this + * statement + */ + public static final IRI path = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "path"); + /** + * This links a node from the forward (5' to 3') strand on the subject node to + * the forward (5' to 3') strand on the predicate node. 
+ */ + public static final IRI linksForwardToForward = SimpleValueFactory.getInstance().createIRI(NAMESPACE, + "linksForwardToForward"); + /** + * This links a node from the forward (5' to 3') strand on the subject node to + * the reverse (3' to 5') strand on the predicate node. + */ + public static final IRI linksForwardToReverse = SimpleValueFactory.getInstance().createIRI(NAMESPACE, + "linksForwardToReverse"); + /** + * This links a node from the reverse (3' to 5') strand on the subject node to + * the forward (5' to 3') strand on the predicate node. + */ + public static final IRI linksReverseToForward = SimpleValueFactory.getInstance().createIRI(NAMESPACE, + "linksReverseToForward"); + /** + * This links a node from the reverse (3' to 5') strand on the subject node to + * the reverse (3' to 5') strand on the predicate node. + */ + public static final IRI linksReverseToReverse = SimpleValueFactory.getInstance().createIRI(NAMESPACE, + "linksReverseToReverse"); + /** + * The super property that says two nodes are linked and does not allow one to + * infer which side to side it goes. + */ + public static final IRI links = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "links"); + /** + * This means this step occurs on the reverse complement of the sequence attached + * to the node (i.e. it is on the implicit reverse (3' to 5') strand) of the + * predicate node. + */ + public static final IRI reverseOfNode = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "reverseOfNode"); + /** + * This means that this step occurs on the forward strand of the sequence + * attached to the node (i.e. it is on the explicit encoded forward (5' to 3') + * strand) of the predicate node. 
+ */ + public static final IRI node = SimpleValueFactory.getInstance().createIRI(NAMESPACE, "node"); } diff --git a/src/test/java/swiss/sib/swissprot/handlegraph4jrdf/GFA2RDFTest.java b/src/test/java/swiss/sib/swissprot/handlegraph4jrdf/GFA2RDFTest.java index 2feecea..89e0ec5 100644 --- a/src/test/java/swiss/sib/swissprot/handlegraph4jrdf/GFA2RDFTest.java +++ b/src/test/java/swiss/sib/swissprot/handlegraph4jrdf/GFA2RDFTest.java @@ -1,27 +1,46 @@ /* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. + * The MIT License + * + * Copyright 2020 Jerven Bolleman . + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
*/ package swiss.sib.swissprot.handlegraph4jrdf; -import swiss.sib.swissprot.handlegraph4jrdf.GFA2RDF; +import static org.junit.jupiter.api.Assertions.assertTrue; + import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.StringReader; import java.net.URISyntaxException; import java.util.Arrays; import java.util.stream.Stream; + import org.eclipse.rdf4j.common.net.ParsedIRI; import org.eclipse.rdf4j.rio.RDFFormat; import org.eclipse.rdf4j.rio.RDFParser; import org.eclipse.rdf4j.rio.Rio; -import static org.junit.jupiter.api.Assertions.assertTrue; import org.junit.jupiter.api.Test; /** * - * @author Jerven Bolleman + * @author Jerven Bolleman */ public class GFA2RDFTest {