finished mappings (probably)
fixed some roundtriptests
eisenbahnplatte committed Aug 27, 2021
1 parent 391cd10 commit 0a980d2
Showing 53 changed files with 4,945 additions and 1,168 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -10,7 +10,7 @@ Example Application Deployment: Download the files of 5 datasets as given in the
 ## Current State
 
 **beta**:
-most of the times it should produce expected results for compression and RDF conversion. Please expect some code refactoring and fluctuation. There will be an open-source licence, either GPL or Apache.
+most of the times it should produce expected results for compression and RDF format.conversion. Please expect some code refactoring and fluctuation. There will be an open-source licence, either GPL or Apache.
 
 
 ## Concept
@@ -28,7 +28,7 @@ The databus-client is designed to unify and convert data on the client-side in s
 * Level 1: all features finished, testing required
 * Level 2: using Apache Compress library covers most of the compression formats, more testing required
 * Level 3: Scalable RDF libraries from [SANSA-Stack](http://sansa-stack.net/) and [Databus Derive](https://github.com/dbpedia/databus-derive). Step by step, extension for all (quasi-)isomorphic [IANA mediatypes](https://www.iana.org/assignments/media-types/media-types.xhtml).
-* Level 4: In addition, we plan to provide a plugin mechanism to incorporate more sophisticated mapping engines as [Tarql](https://tarql.github.io/) (already implemented), [RML](http://rml.io), R2RML, [R2R](http://wifo5-03.informatik.uni-mannheim.de/bizer/r2r/) (for owl:equivalence translation) and XSLT.
+* Level 4: In addition, we plan to provide a plugin mechanism to incorporate more sophisticated format.mapping engines as [Tarql](https://tarql.github.io/) (already implemented), [RML](http://rml.io), R2RML, [R2R](http://wifo5-03.informatik.uni-mannheim.de/bizer/r2r/) (for owl:equivalence translation) and XSLT.
 
 
 ## Usage
4,597 changes: 4,597 additions & 0 deletions errorLog.log

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pom.xml
@@ -201,7 +201,7 @@
         <artifactId>scalatest-maven-plugin</artifactId>
         <version>1.0</version>
         <configuration>
-          <suites>conversionTests.mapping.roundTripTests, conversionTests.conversion.roundTripTests</suites>
+          <suites>archived.format.mapping.roundTripTests, format.conversion.format.conversion.roundTripTests</suites>
           <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
           <junitxml>.</junitxml>
           <filereports>WDF TestSuite.txt</filereports>
@@ -191,7 +191,7 @@ object FileUtil {
    * @return format
    */
   def getFormatType(inputFile: File, compressionInputFile: String): String = {
-    {
+    val format ={
       try {
         if (!(getFormatTypeWithDataID(inputFile) == "")) {
           getFormatTypeWithDataID(inputFile)
@@ -202,6 +202,23 @@ object FileUtil {
         case _: FileNotFoundException => getFormatTypeWithoutDataID(inputFile, compressionInputFile)
       }
     }
+
+    if (format == "rdf") "rdfxml"
+    else format
+  }
 
+  /**
+   * read a query file as string
+   *
+   * @param file query file
+   * @return query string
+   */
+  def readQueryFile(file: File): String = {
+    var queryString: String = ""
+    for (line <- file.lineIterator) {
+      queryString = queryString.concat(line).concat("\n")
+    }
+    queryString
+  }
 
   /**
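The two additions above make FileUtil the single home for format detection and query loading: getFormatType now normalizes the legacy "rdf" label to "rdfxml" (matching the renamed match arm in TripleHandler further down), and readQueryFile moves in from SourceHandler. A hedged usage sketch, assuming the databus-client classes are on the classpath; the file paths are invented:

    import better.files.File
    import org.dbpedia.databus.client.filehandling.FileUtil

    // read a SPARQL query from disk (example path)
    val query: String = FileUtil.readQueryFile(File("./queries/collection.sparql"))
    // detect the format; a file identified as "rdf" now comes back as "rdfxml"
    val format: String = FileUtil.getFormatType(File("./cache/data.rdf"), "")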
@@ -26,7 +26,7 @@ class SourceHandler(conf:CLI_Config) {
     val sourceFile: File = File(conf.source())
 
     if (sourceFile.hasExtension && sourceFile.extension.get.matches(".sparql|.query")) { // conf.source() is a query file
-      val queryString = readQueryFile(sourceFile)
+      val queryString = FileUtil.readQueryFile(sourceFile)
       handleQuery(queryString)
     }
     else { // conf.source() is an already existing file or directory
@@ -173,20 +173,6 @@
       handler.handleResponse(response)
     }
 
-  /**
-   * read a query file as string
-   *
-   * @param file query file
-   * @return query string
-   */
-  def readQueryFile(file: File): String = {
-    var queryString: String = ""
-    for (line <- file.lineIterator) {
-      queryString = queryString.concat(line).concat("\n")
-    }
-    queryString
-  }
 
   def printTask(sourceType: String, source: String, target: String):Unit = {
     val str =
       s"""
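SourceHandler now delegates query loading to FileUtil and keeps only the dispatch. The query-file branch relies on better-files returning the extension with its leading dot; a minimal sketch of that check, with an invented path:

    import better.files.File

    val sourceFile = File("./collection.sparql") // example path
    // extension.get yields ".sparql" (dot included), so the pattern ".sparql|.query" matches
    val isQueryFile = sourceFile.hasExtension && sourceFile.extension.get.matches(".sparql|.query")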
@@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory
 
 import scala.util.control.Breaks.{break, breakable}
 import org.apache.jena.graph.Triple
+import org.apache.jena.sparql.core.Quad
 import org.apache.spark.rdd.RDD
 
 import java.net.URLEncoder
@@ -90,7 +91,8 @@ object FormatConverter {
     //read process
     val quads = {
       if (RDF_QUADS.contains(conf.inputFormat)) quadsHandler.read(file.pathAsString, conf.inputFormat)
-      else RDF_Triples_Mapper.map_to_quads(new TripleHandler().read(file.pathAsString, conf.inputFormat), conf.graphURI)
+      else if (RDF_TRIPLES.contains(conf.inputFormat)) RDF_Triples_Mapper.map_to_quads(new TripleHandler().read(file.pathAsString, conf.inputFormat), conf.graphURI)
+      else Spark.context.emptyRDD[Quad]
     }
 
     //write process
@@ -113,19 +115,8 @@ object FormatConverter {
         }
       }
 
         //write process
         tsdHandler.write(data, conf.outputFormat)
       }
     }
-    // FileUtil.unionFiles(tempDir, targetFile)
-    // if (mappingFile.exists && mappingFile != File("")) {
-    // val mapDir = File("./mappings/")
-    // mapDir.createDirectoryIfNotExists()
-    // mappingFile.moveTo(mapDir / FileUtil.getSha256(targetFile), overwrite = true)
-    // }
-    //}
-    //catch {
-    //  case _: RuntimeException => LoggerFactory.getLogger("UnionFilesLogger").error(s"File $targetFile already exists") //deleteAndRestart(inputFile, inputFormat, outputFormat, targetFile: File)
-    //}
-    //
 
    //  targetFile
 }
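The read path now distinguishes three cases: native quad formats go through QuadsHandler, triple formats are lifted into quads under conf.graphURI, and anything else falls through to an empty RDD instead of being force-read as triples. What the lifting step does per record, sketched with plain Jena (the graph URI is an example standing in for conf.graphURI):

    import org.apache.jena.graph.{NodeFactory, Triple}
    import org.apache.jena.sparql.core.Quad

    val graph = NodeFactory.createURI("http://example.org/graph")
    val triple = Triple.create(
      NodeFactory.createURI("http://example.org/subject"),
      NodeFactory.createURI("http://example.org/predicate"),
      NodeFactory.createLiteral("object"))
    // same subject/predicate/object, now placed inside a named graph
    val quad = new Quad(graph, triple)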
@@ -6,7 +6,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.dbpedia.databus.client.filehandling.FileUtil
 import org.dbpedia.databus.client.filehandling.convert.format.EquivalenceClassHandler
-import org.dbpedia.databus.client.filehandling.convert.format.rdf.quads.format.{NQuads, Trig, Trix}
+import org.dbpedia.databus.client.filehandling.convert.format.rdf.quads.format.{JsonLD, NQuads, Trig, Trix}
 
 class QuadsHandler extends EquivalenceClassHandler[RDD[Quad]]{
 
@@ -23,6 +23,7 @@ class QuadsHandler extends EquivalenceClassHandler[RDD[Quad]]{
       case "nq" => new NQuads().read(source)
       case "trig" => new Trix().read(source)
       case "trix" => new Trig().read(source)
+      case "jsonld" => new JsonLD().read(source)
     }
   }
 
@@ -38,6 +39,7 @@ class QuadsHandler extends EquivalenceClassHandler[RDD[Quad]]{
       case "nq" => new NQuads().write(data)
       case "trig" => new Trig().write(data)
       case "trix" => new Trix().write(data)
+      case "jsonld" => new JsonLD().write(data)
     }
 
   }
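With the new case, the quad equivalence class covers N-Quads, TriG, TriX and JSON-LD. (Incidentally, the "trig" and "trix" read arms above instantiate the swapped classes — Trix for "trig" and Trig for "trix" — while the write arms agree; that mix-up predates this commit.) A hedged usage sketch, assuming a write(data, outputFormat) signature symmetrical to read; the path is invented:

    val handler = new QuadsHandler()
    val quads = handler.read("./cache/data.jsonld", "jsonld") // now routed to the new JsonLD format
    val outFile = handler.write(quads, "nq")                  // re-serialize as N-Quads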
@@ -1,23 +1,40 @@
 package org.dbpedia.databus.client.filehandling.convert.format.rdf.quads.format
 
 import better.files.File
-import org.apache.jena.graph.{NodeFactory, Triple}
-import org.apache.jena.rdf.model.{ModelFactory, ResourceFactory}
-import org.apache.jena.riot.{RDFDataMgr, RDFFormat}
+import org.apache.jena.riot.Lang
 import org.apache.jena.sparql.core.Quad
-import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SparkSession
 import org.dbpedia.databus.client.filehandling.convert.format.Format
 
-import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
-import scala.io.{Codec, Source}
+class JsonLD extends Format[RDD[Quad]] {
+
+  override def read(source: String): RDD[Quad] = {
+    new Trix(Lang.JSONLD).read(source)
+  }
+
+  override def write(t: RDD[Quad]): File = {
+    new Trix(Lang.JSONLD).write(t)
+  }
+}
+//
+//import better.files.File
+//import org.apache.jena.graph.{NodeFactory, Triple}
+//import org.apache.jena.rdf.model.{ModelFactory, ResourceFactory}
+//import org.apache.jena.riot.{RDFDataMgr, RDFFormat}
+//import org.apache.jena.sparql.core.Quad
+//import org.apache.spark.SparkContext
+//import org.apache.spark.rdd.RDD
+//import org.apache.spark.sql.SparkSession
+//import org.dbpedia.databus.client.filehandling.convert.format.Format
+//
+//import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+//import scala.io.{Codec, Source}
 
+//class JsonLD extends Format[RDD[Quad]] {
+//
+//// def readJSONL(spark: SparkSession, inputFile: File): RDD[Triple] = {
+////   val sc = spark.sparkContext
+////   val data = sc.textFile(inputFile.pathAsString)
-//  val sc = spark.sparkContext
-//  val data = sc.textFile(inputFile.pathAsString)
+//// var tripleRDD = sc.emptyRDD[Triple]
+////
+//// // data.foreach(println(_))
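The rewritten JsonLD no longer hand-rolls a parser: it delegates both directions to Trix parameterized with Lang.JSONLD, so Jena's RIOT machinery does the actual work, and the old experimental reader is left commented out below the class. What that delegation boils down to with plain Jena, as a self-contained sketch (the input path is an example):

    import java.io.FileInputStream
    import org.apache.jena.riot.{Lang, RDFDataMgr}
    import scala.collection.JavaConverters._

    val in = new FileInputStream("data.jsonld") // example file
    // stream JSON-LD as quads; triples outside any named graph land in the default graph
    RDFDataMgr.createIteratorQuads(in, Lang.JSONLD, null).asScala
      .foreach(q => println(q.getGraph + " | " + q.asTriple))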
@@ -23,7 +23,7 @@ class TripleHandler extends EquivalenceClassHandler[RDD[Triple]] {
 
     inputFormat match {
       case "nt" => new NTriples().read(source)
-      case "rdf" => new RDFXML().read(source)
+      case "rdfxml" => new RDFXML().read(source)
       case "ttl" =>
         // how can this be done better?
         try {
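Renaming the arm to "rdfxml" keeps the reader in sync with the normalization added in FileUtil.getFormatType, so both spellings of RDF/XML input end up in the same branch. A hedged sketch (path invented):

    import better.files.File
    import org.dbpedia.databus.client.filehandling.FileUtil

    val fmt = FileUtil.getFormatType(File("./cache/data.rdf"), "") // now "rdfxml", not "rdf"
    val triples = new TripleHandler().read("./cache/data.rdf", fmt)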
@@ -3,7 +3,7 @@ package org.dbpedia.databus.client.filehandling.convert.mapping
 
 import org.apache.spark.rdd.RDD
 import org.apache.jena.graph.Triple
 import org.apache.jena.sparql.core.Quad
-import org.apache.spark.sql.functions.lit
+import org.apache.spark.sql.functions.{col, lit}
 import org.apache.spark.sql.{Column, DataFrame}
 import org.dbpedia.databus.client.filehandling.convert.Spark
 import org.dbpedia.databus.client.filehandling.convert.mapping.util.TriplesResult
@@ -31,18 +31,29 @@ object RDF_Quads_Mapper {
   }
 
   def map_to_tsd(data:RDD[Quad], createMapping:Boolean):DataFrame={
+    //calculate partly results
     val triplesData = map_to_triples(data)
     val dataFrameForEachGraph = triplesData.map(triplesResult => {
       val dataFrame = RDF_Triples_Mapper.map_to_tsd(triplesResult.graph, createMapping)
       dataFrame.show()
       dataFrame.withColumn("graph", lit(triplesResult.graphName))
     })
 
-    val resultDataFrame = dataFrameForEachGraph.head
+    //join partly results
+    var resultDataFrame = dataFrameForEachGraph.head
 
-    dataFrameForEachGraph.foreach()
-    df1.join(df2, df1.col("column").equalTo(df2("column")))
-    dataFrameForEachGraph.reduce(_ join _)
+    dataFrameForEachGraph.foreach(df => {
+      var columns = Seq.empty[String]
+      resultDataFrame.columns.foreach(col => {
+        if (df.columns.contains(col)) columns = columns:+col
+      })
+      resultDataFrame=resultDataFrame.join(df, columns, "outer")
+    })
 
+    //sort DataFrame
+    val columns = resultDataFrame.columns
+    val graphColIndex = columns.indexOf("graph")
+    val cols = columns.updated(graphColIndex, columns.head).updated(0, "graph").toSeq
+    resultDataFrame.select(cols.map(x=>col(x)):_*).sort("graph")
   }
 }
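The join replaces the three experimental stub lines: each named graph becomes its own DataFrame, the frames are outer-joined on whichever columns they share (typically "resource" and "graph"), and the "graph" column is swapped to the front before sorting. A self-contained sketch of that idea on toy data (column names invented):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.col

    val spark = SparkSession.builder().master("local[*]").appName("joinSketch").getOrCreate()
    import spark.implicits._

    val g1 = Seq(("r1", "v1", "http://example.org/g1")).toDF("resource", "p1", "graph")
    val g2 = Seq(("r2", "v2", "http://example.org/g2")).toDF("resource", "p2", "graph")

    val shared = g1.columns.intersect(g2.columns).toSeq  // Seq("resource", "graph")
    val joined = g1.join(g2, shared, "outer")            // rows of both graphs, nulls where predicates differ

    val reordered = "graph" +: joined.columns.filterNot(_ == "graph").toSeq
    joined.select(reordered.map(col): _*).sort("graph").show()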

@@ -40,15 +40,15 @@ object RDF_Triples_Mapper {
    * converts RDF data (RDD[Triple]) to TSD data (DataFrame)
    *
    * @param inData RDF input data
-   * @param createMappingFile create a mapping file for conversion back to RDF
+   * @param createMappingFile create a format.mapping file for format.conversion back to RDF
    * @return tabular structured data
    */
   def triplesToTSD(inData: RDD[Triple], createMappingFile: Boolean): Seq[DataFrame] = {
 
     val triplesGroupedBySubject = inData.groupBy(triple => triple.getSubject).map(_._2)
     val allPredicates = inData.groupBy(triple => triple.getPredicate.getURI).map(_._1)
 
-    val prefixPre = "xxx" //for mapping file
+    val prefixPre = "xxx" //for format.mapping file
 
     val mappedPredicates =
       Seq(Seq("resource")) ++ allPredicates.map(
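triplesToTSD pivots the triples into one table row per subject and one column per distinct predicate. A toy sketch of the two grouping steps, assuming the project's Spark wrapper exposes a SparkContext as Spark.context:

    import org.apache.jena.graph.{NodeFactory, Triple}
    import org.dbpedia.databus.client.filehandling.convert.Spark

    val s = NodeFactory.createURI("http://example.org/resource1")
    val triples = Spark.context.parallelize(Seq(
      Triple.create(s, NodeFactory.createURI("http://example.org/p1"), NodeFactory.createLiteral("a")),
      Triple.create(s, NodeFactory.createURI("http://example.org/p2"), NodeFactory.createLiteral("b"))))

    val rows = triples.groupBy(triple => triple.getSubject).map(_._2)          // one group per table row
    val cols = triples.groupBy(triple => triple.getPredicate.getURI).map(_._1) // one entry per column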
@@ -88,7 +88,7 @@ object TSD_Mapper {
   }
 
 
-  def map_to_quads()={
-
-  }
+  // def map_to_quads()={
+  //
+  // }
 }
@@ -31,11 +31,11 @@ For usage of parameters see below:
   val target: ScallopOption[String] = opt[String](default = Some("./files/"), descr = "set the target directory for converted files")
   val overwrite: ScallopOption[Boolean] = opt[Boolean](default = Some(false), descr = "true -> overwrite files in cache, false -> use cache")
   val clear: ScallopOption[Boolean] = opt[Boolean](default = Some(false), noshort= true, descr = "true -> clear Cache")
-  val mapping: ScallopOption[String] = opt[String](default = Some(""), descr = "set the mapping file for conversion to different format equivalence class")
+  val mapping: ScallopOption[String] = opt[String](default = Some(""), descr = "set the format.mapping file for format.conversion to different format equivalence class")
   val delimiter: ScallopOption[String] = opt[String](default = Some(","), descr = "set the delimiter (only for some formats)")
   val quotation: ScallopOption[String] = opt[String](default = Some("\""), descr = "set the quotation (only for some formats)")
-  val createMapping: ScallopOption[Boolean] = opt[Boolean](default = Some(false), descr = "Do you want to create mapping files for mapped sources?")
-  val graphURI: ScallopOption[String] = opt[String](default = Some(""), descr = "set the graph uri for mapping from rdf triples to rdf quads")
+  val createMapping: ScallopOption[Boolean] = opt[Boolean](default = Some(false), descr = "Do you want to create format.mapping files for mapped sources?")
+  val graphURI: ScallopOption[String] = opt[String](default = Some(""), descr = "set the graph uri for format.mapping from rdf triples to rdf quads")
 
   verify()
 }
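Only the descr strings changed here; the option names and defaults are untouched, so existing invocations keep working. A hedged construction sketch, assuming CLI_Config passes its arguments straight to ScallopConf (values invented):

    val conf = new CLI_Config(Seq(
      "--source", "./queries/collection.sparql",
      "--target", "./converted/",
      "--graphURI", "http://example.org/defaultGraph"))

    println(conf.mapping())  // "" unless a mapping file was supplied
    println(conf.graphURI()) // graph used when lifting triples to quads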
@@ -45,13 +45,13 @@ object DatabusQueries {
      |PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
      |PREFIX dcat: <http://www.w3.org/ns/dcat#>
      |PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
-     |PREFIX mapping: <
+     |PREFIX format.mapping: <
      |
-     |SELECT DISTINCT ?mapping
+     |SELECT DISTINCT ?format.mapping
      |WHERE {
      |  ?dataIdElement dataid:sha256sum "$sha"^^xsd:string .
      |  ?dataIdElement dataid:file ?file .
-     |  ?mapping <http://tmp-namespace.org/databusFixRequired> ?file .
+     |  ?format.mapping <http://tmp-namespace.org/databusFixRequired> ?file .
      |}
      |""".stripMargin
 
@@ -66,7 +66,7 @@
      |PREFIX map: <http://databus-client.tools.dbpedia.org/vocab/mapping/>
      |PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
      |
-     |SELECT DISTINCT ?mapping
+     |SELECT DISTINCT ?format.mapping
      |WHERE {
      |  ?dataIdElement dataid:sha256sum "$sha"^^xsd:string ;
      |                 dataid:file ?file ;
@@ -105,7 +105,7 @@
      |  FILTER (?from <= ?version && ?until >= ?version)
      |
      |
-     |  BIND( coalesce(?mapping1, ?mapping2, ?mapping3) as ?mapping)
+     |  BIND( coalesce(?mapping1, ?mapping2, ?mapping3) as ?format.mapping)
      |}
      |""".stripMargin
@@ -6,10 +6,10 @@ object MappingQueries {
     s"""
       |PREFIX map: <http://databus-client.tools.dbpedia.org/vocab/mapping/>
       |
-      |SELECT DISTINCT ?mapping
+      |SELECT DISTINCT ?format.mapping
       |WHERE {
-      |?mapping a map:MappingFile .
-      |<$mappingInfoFile> map:hasMappingFile ?mapping .
+      |?format.mapping a map:MappingFile .
+      |<$mappingInfoFile> map:hasMappingFile ?format.mapping .
       |}
       |""".stripMargin
 
@@ -19,11 +19,11 @@ object MappingQueries {
       |
       |SELECT DISTINCT *
       |WHERE {
-      |?mapping a map:MappingFile .
+      |?format.mapping a map:MappingFile .
      |
      |<$mappingInfoFile> map:hasDelimiter ?delimiter ;
      |                   map:hasQuotation ?quotation ;
-     |                   map:hasMappingFile ?mapping .
+     |                   map:hasMappingFile ?format.mapping .
      |}
      |""".stripMargin
 }
}
