Skip to content

Commit

Permalink
added more mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
Eisenbahnplatte committed Aug 26, 2021
1 parent c8606ff commit 391cd10
Show file tree
Hide file tree
Showing 13 changed files with 81 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ class CompileConfig(val inputFormat: String,
val mapping: String,
val delimiter: Character,
val quotation: Character,
val createMapping: Boolean) {
val createMapping: Boolean,
val graphURI: String,
val outFile: File) {

var sha = ""
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,20 @@ class FileHandler(cliConfig: CLI_Config) {
mapping = cliConfig.mapping(),
delimiter = cliConfig.delimiter().toCharArray.head,
quotation = cliConfig.quotation().toCharArray.head,
createMapping = cliConfig.createMapping()
createMapping = cliConfig.createMapping(),
graphURI = cliConfig.graphURI(),
outFile = getOutputFile(inputFile)
)

val outFile: File = getOutputFile(inputFile)

// Without any Conversion
if ((config.inputCompression == config.outputCompression) && (config.inputFormat == config.outputFormat)) {
copyStream(new FileInputStream(inputFile.toJava), new FileOutputStream(outFile.toJava))
Some(outFile)
copyStream(new FileInputStream(inputFile.toJava), new FileOutputStream(config.outFile.toJava))
Some(config.outFile)
}
// Only Compression Conversion
else if (config.inputCompression != config.outputCompression && (config.inputFormat == config.outputFormat)) {
copyStream(Compressor.decompress(inputFile), Compressor.compress(outFile, config.outputCompression))
Some(outFile)
copyStream(Compressor.decompress(inputFile), Compressor.compress(config.outFile, config.outputCompression))
Some(config.outFile)
}

// File Format Conversion (need to uncompress anyway)
Expand All @@ -77,27 +77,25 @@ class FileHandler(cliConfig: CLI_Config) {
}

if (formatConvertedData.isDirectory){
outFile.createDirectoryIfNotExists()
config.outFile.createDirectoryIfNotExists()
val formatConvertedFiles = formatConvertedData.children
var i = 1
while(formatConvertedFiles.hasNext) {
val formatConvertedFile = formatConvertedFiles.next()
val newOutFile = {
if (config.outputCompression.nonEmpty) outFile / s"$i.${config.outputFormat}.${config.outputCompression}"
else outFile / s"$i.${config.outputFormat}"
if (config.outputCompression.nonEmpty) config.outFile / s"${formatConvertedFile.name}.${config.outputCompression}"
else config.outFile / s"${formatConvertedFile.name}"
}
val compressedOutStream = Compressor.compress(newOutFile, config.outputCompression)
copyStream(new FileInputStream(formatConvertedFile.toJava), compressedOutStream)
i+=1
}
} else {
val compressedOutStream = Compressor.compress(outFile, config.outputCompression)
val compressedOutStream = Compressor.compress(config.outFile, config.outputCompression)
copyStream(new FileInputStream(formatConvertedData.toJava), compressedOutStream)
}

//DELETE TEMPDIR
// if (typeConvertedFile.parent.exists) typeConvertedFile.parent.delete()
Some(outFile)
Some(config.outFile)
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@ import org.dbpedia.databus.client.filehandling.convert.format.rdf.quads.QuadsHan
import org.dbpedia.databus.client.filehandling.{CompileConfig, FileUtil}
import org.dbpedia.databus.client.filehandling.convert.format.tsd.TSDHandler
import org.dbpedia.databus.client.filehandling.convert.format.rdf.triples.TripleHandler
import org.dbpedia.databus.client.filehandling.convert.mapping.{MappingInfo, RDF_Quads_Mapper, RDF_Triples_Mapper, TSD_Mapper}
import org.dbpedia.databus.client.filehandling.convert.mapping.{RDF_Quads_Mapper, RDF_Triples_Mapper, TSD_Mapper}
import org.dbpedia.databus.client.main.CLI_Config
import org.dbpedia.databus.client.sparql.QueryHandler
import org.slf4j.LoggerFactory

import scala.util.control.Breaks.{break, breakable}
import org.apache.jena.graph.Triple
import org.apache.spark.rdd.RDD

import java.net.URLEncoder
/**
* Converter for tsv, csv and several RDF serializations (nt,ttl,rdfxml,json-ld, nq, trix, trig)
*/
Expand Down Expand Up @@ -56,29 +58,28 @@ object FormatConverter {
val tripleHandler = new TripleHandler()

//read process
val triples:Array[RDD[Triple]] = {
if (RDF_TRIPLES.contains(conf.inputFormat)) {
Array(tripleHandler.read(file.pathAsString, conf.inputFormat))
}
if (RDF_QUADS.contains(conf.inputFormat)) {
val quads = new QuadsHandler().read(file.pathAsString, conf.inputFormat)
RDF_Quads_Mapper.map_to_triples(quads)
}
else Array(TSD_Mapper.map_to_triples(file, conf))
}

//write process
if (triples.length>1){
tripleHandler.write(triples.head, conf.outputFormat)
} else {
var i=1
triples.foreach(rdd => {
val convertedFile = tripleHandler.write(rdd, conf.outputFormat)
convertedFile.moveTo(targetTempDir / s"$i.${conf.outputFormat}")
i+=1
if (RDF_QUADS.contains(conf.inputFormat)) {
val quads = new QuadsHandler().read(file.pathAsString, conf.inputFormat)
val triples = RDF_Quads_Mapper.map_to_triples(quads)

triples.foreach(triplesResult => {
val convertedFile = tripleHandler.write(triplesResult.graph, conf.outputFormat)
val outFile = targetTempDir / s"${conf.outFile.nameWithoutExtension}_graph=${URLEncoder.encode(triplesResult.graphName, "UTF-8")}.${conf.outputFormat}"
convertedFile.moveTo(outFile)
})

targetTempDir
} else {
val triples:Array[RDD[Triple]] = {
if (RDF_TRIPLES.contains(conf.inputFormat)) {
Array(tripleHandler.read(file.pathAsString, conf.inputFormat))
}
else { //TSD.contains(conf.inputFormat)
Array(TSD_Mapper.map_to_triples(file, conf))
}
}

tripleHandler.write(triples.head, conf.outputFormat)
}

}
Expand All @@ -89,41 +90,30 @@ object FormatConverter {
//read process
val quads = {
if (RDF_QUADS.contains(conf.inputFormat)) quadsHandler.read(file.pathAsString, conf.inputFormat)
else RDF_Triples_Mapper.map_to_quads(new TripleHandler().read(file.pathAsString, conf.inputFormat))
else RDF_Triples_Mapper.map_to_quads(new TripleHandler().read(file.pathAsString, conf.inputFormat), conf.graphURI)
}

//write process
quadsHandler.write(quads, conf.outputFormat)
}

else { // convert to Tabular structured data (TSD)
val tsdHandler = new TSDHandler(conf.delimiter)

//read process
val data = {
if (TSD.contains(conf.inputFormat)) Array(tsdHandler.read(file.pathAsString, conf.inputFormat))
if (TSD.contains(conf.inputFormat)) tsdHandler.read(file.pathAsString, conf.inputFormat)
else if (RDF_QUADS.contains(conf.inputFormat)) {
val quads = new QuadsHandler().read(file.pathAsString, conf.inputFormat)
RDF_Quads_Mapper.map_to_tsd(quads, conf.createMapping)
}
else { //RDF_TRIPLES.contains(conf.inputFormat)
val triples = new TripleHandler().read(file.pathAsString, conf.inputFormat)
Array(RDF_Triples_Mapper.map_to_tsd(triples, conf.createMapping))
RDF_Triples_Mapper.map_to_tsd(triples, conf.createMapping)
}
}

//write
if (data.length==1){
tsdHandler.write(data.head, conf.outputFormat)
} else {
var i=1
data.foreach(rdd => {
val convertedFile = tsdHandler.write(rdd, conf.outputFormat)
convertedFile.moveTo(targetTempDir / s"$i.${conf.outputFormat}")
i+=1
})

targetTempDir
}
tsdHandler.write(data, conf.outputFormat)
}
}
// FileUtil.unionFiles(tempDir, targetFile)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.dbpedia.databus.client.filehandling.convert.format.tsd.format.{CSV, TSV}
import org.dbpedia.databus.client.filehandling.convert.format.{EquivalenceClassHandler, tsd}
import org.dbpedia.databus.client.filehandling.convert.mapping.MappingInfo
import org.dbpedia.databus.client.filehandling.convert.mapping.util.MappingInfo

/**
* object to handle csv and tsv files
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@ package org.dbpedia.databus.client.filehandling.convert.mapping
import org.apache.spark.rdd.RDD
import org.apache.jena.graph.Triple
import org.apache.jena.sparql.core.Quad
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{Column, DataFrame}
import org.dbpedia.databus.client.filehandling.convert.Spark
import org.dbpedia.databus.client.filehandling.convert.mapping.util.TriplesResult

object RDF_Quads_Mapper {

def map_to_triples(data:RDD[Quad]):Array[RDD[Triple]]={
def map_to_triples(data:RDD[Quad]):Array[TriplesResult]={
val graphs = data
.groupBy(quad quad.getGraph)
.map(_._2)
Expand All @@ -17,16 +19,30 @@ object RDF_Quads_Mapper {
graphs.map(iterable => {
var data: Seq[Triple] = Seq.empty
val iterator = iterable.iterator
var graphName = ""
while (iterator.hasNext) {
data = data :+ iterator.next().asTriple()
val quad = iterator.next()
graphName = quad.getGraph.toString
data = data :+ quad.asTriple()
}
Spark.context.parallelize(data)
new TriplesResult(graphName, Spark.context.parallelize(data))
})

}

def map_to_tsd(data:RDD[Quad], createMapping:Boolean):Array[DataFrame]={
def map_to_tsd(data:RDD[Quad], createMapping:Boolean):DataFrame={
val triplesData = map_to_triples(data)
triplesData.map(graph => RDF_Triples_Mapper.map_to_tsd(graph, createMapping))
val dataFrameForEachGraph = triplesData.map(triplesResult => {
val dataFrame = RDF_Triples_Mapper.map_to_tsd(triplesResult.graph, createMapping)
dataFrame.show()
dataFrame.withColumn("graph", lit(triplesResult.graphName))
})

val resultDataFrame = dataFrameForEachGraph.head

dataFrameForEachGraph.foreach()
df1.join(df2, df1.col("column").equalTo(df2("column")))
dataFrameForEachGraph.reduce(_ join _)
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ import org.apache.jena.sparql.core.Quad
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.dbpedia.databus.client.filehandling.convert.Spark
import org.dbpedia.databus.client.filehandling.convert.mapping.util.Tarql_Writer

object RDF_Triples_Mapper {

val tempDir:File = File("./target/databus.tmp/temp/")

def map_to_quads(data:RDD[Triple]): RDD[Quad] ={
data.map(triple => Quad.create(NodeFactory.createBlankNode(), triple))
def map_to_quads(data:RDD[Triple], graphName:String): RDD[Quad] = {
if (graphName == "DefaultGraph") data.map(triple => Quad.create(Quad.defaultGraphIRI, triple))
else data.map(triple => Quad.create(NodeFactory.createURI(graphName), triple))
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import org.apache.parquet.io.InputFile
import org.dbpedia.databus.client.filehandling.CompileConfig
import org.dbpedia.databus.client.filehandling.convert.Spark
import org.dbpedia.databus.client.filehandling.convert.format.tsd
import org.dbpedia.databus.client.filehandling.convert.mapping.util.MappingInfo
import org.deri.tarql.{CSVOptions, TarqlParser, TarqlQueryExecutionFactory}
import org.slf4j.LoggerFactory

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.dbpedia.databus.client.filehandling.convert.mapping
package org.dbpedia.databus.client.filehandling.convert.mapping.util

class MappingInfo(
val mappingFile: String,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.dbpedia.databus.client.filehandling.convert.mapping
package org.dbpedia.databus.client.filehandling.convert.mapping.util

import better.files.File
import org.apache.spark.sql.DataFrame

import java.io.PrintWriter
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package org.dbpedia.databus.client.filehandling.convert.mapping.util

import org.apache.jena.graph.Triple
import org.apache.spark.rdd.RDD

/**
 * Pairs a named graph's identifier with the RDD of triples extracted from it.
 *
 * @param graphName string form of the graph node the triples came from
 * @param graph     the triples belonging to that graph
 */
class TriplesResult(val graphName: String, val graph: RDD[Triple]) {}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ For usage of parameters see below:
val delimiter: ScallopOption[String] = opt[String](default = Some(","), descr = "set the delimiter (only for some formats)")
val quotation: ScallopOption[String] = opt[String](default = Some("\""), descr = "set the quotation (only for some formats)")
val createMapping: ScallopOption[Boolean] = opt[Boolean](default = Some(false), descr = "Do you want to create mapping files for mapped sources?")
val graphURI: ScallopOption[String] = opt[String](default = Some(""), descr = "set the graph uri for mapping from rdf triples to rdf quads")

verify()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import org.apache.jena.JenaRuntime
import org.apache.jena.query._
import org.apache.jena.rdf.model.{Model, ModelFactory}
import org.apache.jena.riot.{RDFDataMgr, RDFLanguages}
import org.dbpedia.databus.client.filehandling.convert.mapping.MappingInfo
import org.dbpedia.databus.client.filehandling.convert.mapping.util.MappingInfo
import org.dbpedia.databus.client.sparql.queries.{DataIdQueries, DatabusQueries, MappingQueries}
import org.slf4j.{Logger, LoggerFactory}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
//import org.dbpedia.databus.client.filehandling.convert.format.tsd.TSDHandler
//import org.dbpedia.databus.client.filehandling.convert.format.rdf.triples.TripleHandler
//import org.dbpedia.databus.client.filehandling.download.Downloader
//import org.dbpedia.databus.client.filehandling.convert.mapping.MappingInfo
//import org.dbpedia.databus.client.filehandling.convert.mapping.util.MappingInfo
//import org.dbpedia.databus.client.filehandling.{FileHandler, FileUtil}
//import org.scalatest.FlatSpec
//
Expand Down

0 comments on commit 391cd10

Please sign in to comment.