spark {
  # propagates to the corresponding Spark setting
  local.dir = "/tmp/spark-tmp"
  eventLog.enabled = true

  # the number of partitions to generally split datasets into before serialisation
  # (this should usually match the number of processing threads available)
  defaultReadWritePartitions = 12

  # the number of partitions to generally split datasets into for computations
  # (Spark books and guides usually recommend 2-4 times the number of processing threads available)
  defaultProcessingPartitions = 24
}

preprocessing {
  ##### Filters #####

  # list of namespaces, given as IRI prefixes, that restricts which IRIs occurring in subject and object position
  # of the source data receive SIDs (and hence will be replaced with a global identifier IRI later)
  namespaceWhitelist = ["http://musicbrainz.org", "http://d-nb.info/gnd/", "http://sws.geonames.org", "http://viaf.org"]
  # namespaceWhitelist = ["http://sws.geonames.org", "http://d-nb.info", "http://data.bibliotheken.nl",
  #   "https://permid.org", "http://wikidata.dbpedia.org", "http://dbpedia.org", "http://de.dbpedia.org",
  #   "http://nl.dbpedia.org", "http://sv.dbpedia.org", "http://fr.dbpedia.org", "http://diffbot.com",
  #   "http://viaf.org", "http://www.wikidata.org", "http://musicbrainz.org/"]

  # IRI prefixes listed in the file specified here will be added to `namespaceWhitelist` as well;
  # this can be convenient for a long whitelist
  namespaceWhitelistFile = "./src/main/resources/whitelist.txt"
  namespaceBlacklistFile = "./src/main/resources/blacklist.txt"

  # list of property IRIs that will exempt subject IRIs, i.e. if a statement with subject ?s and a predicate ?p that
  # is part of this blacklist appears in the input, the IRI of ?s will not receive an SID
  subjectIriPropertyBlacklist = []

  # list of property IRIs that will exempt object IRIs, i.e. if a statement with object ?o and a predicate ?p that
  # is part of this blacklist appears in the input, the IRI of ?o will not receive an SID
  objectIriPropertyBlacklist = ["http://www.w3.org/1999/02/22-rdf-syntax-ns#type"]

  # list of property IRIs that will exempt both subject and object IRIs: for each statement with a predicate ?p that
  # is part of this blacklist, the IRIs of its subject and object will not receive SIDs
  subjectObjectIriPropertyBlacklist = ["http://www.w3.org/2000/01/rdf-schema#subClassOf"]

  # list of RDF types whose instances should be excluded from processing: all IRI resources for which an rdf:type
  # assignment to one of the listed types exists in the corresponding dataset input will be excluded from processing;
  # more concretely, all triples from the corresponding dataset with such a resource IRI as subject will be ignored
  rdfTypeBlacklists = {
    "dnb": ["http://d-nb.info/standards/elementset/gnd#UndifferentiatedPerson"]
  }
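
  # Illustrative example (hypothetical input data, not part of the configuration): with the filters above, a triple
  # such as
  #   <http://viaf.org/viaf/12345> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Person> .
  # would have its subject IRI receive an SID (the prefix "http://viaf.org" is whitelisted and rdf:type is not on
  # `subjectIriPropertyBlacklist`), while the object IRI would not, since rdf:type is on `objectIriPropertyBlacklist`.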
}

ids {
  ##### Essential Settings #####

  # directory of source datasets (cf. README)
  sourceDataDir = "/data/id-management/input/"

  # directory where the individual tasks will write their results and that downstream tasks will search for
  # results from tasks they depend on
  intermediateResultsDir = "/data/id-management/intermediate-results"

  # final results to be submitted to the Databus will be saved into this folder
  persistenceDir = "/data/id-management/persistence"

  ##### Behaviour flags #####

  # if false, ignore the TSV file of previously assigned SIDs and reassign SIDs for all local IRIs from scratch
  loadSingletonIds = true

  # perform additional computation steps after the assignment of (additional) SIDs to ensure that the
  # local IRI <-> SID mapping is bijective
  checkSingletonIdsBijective = true
}

dev {
  ##### Settings for development/debugging #####

  # When enabled, the `SparkSession` of a Spark program will not close immediately when an exception occurs. Instead,
  # there will be a prompt waiting for a newline to be entered before the session is closed (and thus cleanup
  # happens). This can be useful for inspecting temporary files created by Spark while debugging.
  inspectSparkSessionOnError = true

  # filter for the input files for parse-to-binary - only read inputs under the listed bundles/stacks
  // inputFileBundleFilter = ["mappings"]

  # filter for the input files for parse-to-binary - only read input files of (compressed) size <= k MiB
  // inputFileSizeLimit = 4

  # filter for the input files for parse-to-binary - only read input files of (compressed) size >= k MiB
  // inputFileSizeMinimum = 2

  # cap for the input files for parse-to-binary - read no more than k input files
  // maxInputFiles = 6
}
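
# Note (assumption, not part of this configuration): if the application resolves this file via Typesafe Config's
# default loading (`ConfigFactory.load()`), JVM system properties take precedence over the values defined here, so
# individual settings could be overridden per run without editing the file, e.g.:
#   java -Dspark.defaultProcessingPartitions=48 -Dids.sourceDataDir=/data/other-input/ ...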