config/meta_config.yaml

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright 2016 Silvio Peroni <essepuntato@gmail.com>
# Copyright 2020 Fabio Mariani <fabio.mariani555@gmail.com>
# Copyright 2022 Arcangelo Massari <arcangelo.massari@unibo.it>
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

# Mandatory settings

# Endpoint URL to load the output RDF
triplestore_url: http://127.0.0.1:19999/blazegraph/sparql
# A list of triplestore URLs containing provenance metadata
provenance_endpoints: []
# Directory where raw CSV files are stored
input_csv_dir: /srv/meta/meta_input
# The path to the base directory to save all output files
base_output_dir: &base_output_dir /srv/meta/meta_output
# A URI string representing the provenance agent which is considered responsible for the RDF graph manipulation
resp_agent: https://orcid.org/0000-0002-8420-0696

# Optional settings

# Folder where RDF files are saved. Since these files are the heaviest, it is sometimes convenient to save them on HDD, while the triplestore needs to be on SSD for its efficient operation
output_rdf_dir: *base_output_dir
# The base URI of entities on Meta. This setting can be safely left as is
base_iri: https://w3id.org/oc/meta/
# URL where the namespaces and prefixes used in the OpenCitations Data Model are defined. This setting can be safely left as is
context_path: https://w3id.org/oc/corpus/context.json
# Number of files per folder. dir_split_number's value must be multiple of items_per_file's value. This parameter is useful only if you choose to return the output in json-ld format
dir_split_number: 10000
# Number of items per file. This parameter is useful only if you choose to return the output in json-ld format
items_per_file: 1000
# This value is used as the default prefix if no prefix is specified. It is a deprecated parameter, valid only for backward compatibility and can safely be ignored
default_dir: _
# A prefix for the sequential number in entities’ URIs. This setting can be safely left as is
supplier_prefix: '060'
# If True, save all the graphset and provset in one file, and save all the graphset on the triplestore. 
# If False, the graphs are saved according to the usual OpenCitations strategy (the "complex" hierarchy of folders and subfolders for each type of entity)
rdf_output_in_chunks: False
# If True, the folder specified in output_rdf_dir must contain zipped JSON files, and the output will be zipped 
zip_output_rdf: False
# Data source URL. This setting can be safely left as is
source: https://api.crossref.org/
# If True, use the DOI API service to check if DOIs are valid
use_doi_api_service: False
# Number of cores to devote to the Meta process
workers_number: 16
# True if Blazegraph was used as a provenance triplestore, and a textual index was built to speed up queries. For more information, see https://github.com/blazegraph/database/wiki/Rebuild_Text_Index_Procedure
blazegraph_full_text_search: False
# True if Fuseki was used as a provenance triplestore, and a textual index was built to speed up queries. For more information, see https://jena.apache.org/documentation/query/text-query.html
fuseki_full_text_search: False
# True if Virtuoso was used as a provenance triplestore, and a textual index was built to speed up queries. For more information, see https://docs.openlinksw.com/virtuoso/rdfsparqlrulefulltext/
virtuoso_full_text_search: True
# The name of the Lucene connector if GraphDB was used as a provenance triplestore and a textual index was built to speed up queries. For more information, see https://graphdb.ontotext.com/documentation/free/general-full-text-search-with-connectors.html
graphdb_connector_name: ''
# Specifies the triplestore URL to use as a cache to make queries on provenance faster
cache_endpoint: ''
# If your cache provenance triplestore uses different endpoints for reading and writing (e.g. GraphDB), specify the endpoint for writing in this parameter
cache_update_endpoint: ''
# Fields in the silencer list are only updated if there is no information on that field in OpenCitations Meta. For example, if 'author' is specified, any new authors are not added to the list if authors are already present.
silencer: ['author', 'editor', 'publisher']