From 547add98c24da7739700e5b9b5082a55fb19d50b Mon Sep 17 00:00:00 2001
From: Hannah Bast
Date: Wed, 4 Dec 2024 22:34:45 +0100
Subject: [PATCH] Easy `MULTI_INPUT_JSON` specification with `for-each` (#95)

So far, specifying the `MULTI_INPUT_JSON` was cumbersome when the input
consists of many files that are assigned to few graphs via a pattern. A
typical case is when all files in a directory or with a certain prefix
should be assigned to the same graph. There is now a nice and simple
syntax for this. Here are two example elements of a `MULTI_INPUT_JSON`
array, which should be self-explanatory:

```
{ "cmd": "zcat {}", "graph": "http://example.org/graph1", "for-each": "dir1/*.ttl.gz" }
{ "cmd": "zcat {}", "graph": "http://example.org/graph2", "for-each": "dir2/*.ttl.gz" }
```

Update the `Qleverfile` for UniProt to use this new feature (and read the
input as gzipped TTL files, and not N-Triples as before).
---
 src/qlever/Qleverfiles/Qleverfile.uniprot | 64 +++++++++++++++++------
 src/qlever/commands/index.py              | 57 ++++++++++++++++----
 2 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/src/qlever/Qleverfiles/Qleverfile.uniprot b/src/qlever/Qleverfiles/Qleverfile.uniprot
index 74169406..aa0c058e 100644
--- a/src/qlever/Qleverfiles/Qleverfile.uniprot
+++ b/src/qlever/Qleverfiles/Qleverfile.uniprot
@@ -1,30 +1,62 @@
 # Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control
 #
-# qlever get-data  # takes ~ 30 hours and ~ 2 TB of disk (for the NT files)
-# qlever index     # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever get-data  # takes ~ 30 hours and ~ 1.6 TB of disk (for the TTL files)
+# qlever index     # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 9950X)
 # qlever start     # starts the server (takes a few seconds)
 #
-# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv
+# Install packages: sudo apt install -y libxml2-utils parallel xz-utils wget
 # Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
 #
 # Set DATE to the date of the latest release. Build on SSD (requires ~ 7 TB
-# during build, ~ 3 TB after build). The uniprot.index.???.meta files can be on
-# HDD without significant performance loss (when running the server).
+# during build, ~ 3 TB after build).
 
 [data]
-NAME            = uniprot
-DATE            = 2024-05-29
-DOWNLOAD_URL    = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
-GET_RDFXML_CMD  = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s/<metalink .*>/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done
-RDFXML2NT_CMD   = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | gzip -c > nt.${DATE}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
-GET_DATA_CMD    = rdfxml --help && date > ${NAME}.get-data.begin-date && ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD} && date > ${NAME}.get-data.end-date
-DESCRIPTION     = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE}
+NAME             = uniprot
+DATE             = 2024-11-27
+RDFXML_DIR       = rdf.${DATE}
+TTL_DIR          = ttl.${DATE}
+UNIPROT_URL      = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
+RHEA_URL         = https://ftp.expasy.org/databases/rhea/rdf
+EXAMPLES_URL     = https://github.com/sib-swiss/sparql-examples
+GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone ${EXAMPLES_URL} && (cd sparql-examples && ./convertToOneTurtle.sh -p uniprot && gzip examples_uniprot.ttl && mv -f examples_uniprot.ttl.gz ../${TTL_DIR} && cd .. && rm -rf sparql-examples)
+GET_RDFXML_CMD   = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URL}/chebi.owl.gz"; echo "${RHEA_URL}/rhea.rdf.gz"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s/<metalink .*>/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done
+RDFXML2TTL_CMD   = mkdir -p ${TTL_DIR} && for RDFXML in ${RDFXML_DIR}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=ttl -q 2> ${TTL_DIR}/$$(basename $$RDFXML).stderr | gzip -c > ${TTL_DIR}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/ttl.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
+GET_DATA_CMD     = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date
+DESCRIPTION      = Complete UniProt data from ${UNIPROT_URL}, with additional data from ${RHEA_URL} and ${EXAMPLES_URL}
 
 [index]
-INPUT_FILES     = nt.${data:DATE}/*.nt.gz
-CAT_INPUT_FILES = parallel --tmpdir . -j 4 'zcat -f {}' ::: ${INPUT_FILES} | pv -q -B 5G
-SETTINGS_JSON   = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
-STXXL_MEMORY    = 60G
+INPUT_FILES      = ${data:TTL_DIR}/*.ttl.gz
+MULTI_INPUT_JSON = [{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_reviewed_*.ttl.gz" },
+                    { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_unreviewed_*.ttl.gz" },
+                    { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniparc", "for-each": "${data:TTL_DIR}/uniparc_*.ttl.gz" },
+                    { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniref", "for-each": "${data:TTL_DIR}/uniref*.ttl.gz" },
+                    { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/obsolete", "for-each": "${data:TTL_DIR}/uniprotkb_obsolete_*.ttl.gz" },
+                    { "cmd": "zcat ${data:TTL_DIR}/chebi.ttl.gz", "graph": "http://sparql.uniprot.org/chebi" },
+                    { "cmd": "zcat ${data:TTL_DIR}/citations_mapping.ttl.gz", "graph": "http://sparql.uniprot.org/citationmapping" },
+                    { "cmd": "zcat ${data:TTL_DIR}/citations.ttl.gz", "graph": "http://sparql.uniprot.org/citations" },
+                    { "cmd": "zcat ${data:TTL_DIR}/databases.ttl.gz", "graph": "http://sparql.uniprot.org/databases" },
+                    { "cmd": "zcat ${data:TTL_DIR}/diseases.ttl.gz", "graph": "http://sparql.uniprot.org/diseases" },
+                    { "cmd": "zcat ${data:TTL_DIR}/enzyme-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
+                    { "cmd": "zcat ${data:TTL_DIR}/enzyme.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
+                    { "cmd": "zcat ${data:TTL_DIR}/go-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
+                    { "cmd": "zcat ${data:TTL_DIR}/go.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
+                    { "cmd": "zcat ${data:TTL_DIR}/journals.ttl.gz", "graph": "http://sparql.uniprot.org/journal" },
+                    { "cmd": "zcat ${data:TTL_DIR}/keywords-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
+                    { "cmd": "zcat ${data:TTL_DIR}/keywords.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
+                    { "cmd": "zcat ${data:TTL_DIR}/locations-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
+                    { "cmd": "zcat ${data:TTL_DIR}/locations.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
+                    { "cmd": "zcat ${data:TTL_DIR}/pathways-hierarchy*.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
+                    { "cmd": "zcat ${data:TTL_DIR}/pathways.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
+                    { "cmd": "zcat ${data:TTL_DIR}/proteomes.ttl.gz", "graph": "http://sparql.uniprot.org/proteomes" },
+                    { "cmd": "zcat ${data:TTL_DIR}/taxonomy-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
+                    { "cmd": "zcat ${data:TTL_DIR}/taxonomy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
+                    { "cmd": "zcat ${data:TTL_DIR}/tissues.ttl.gz", "graph": "http://sparql.uniprot.org/tissues" },
+                    { "cmd": "zcat ${data:TTL_DIR}/rhea.ttl.gz", "graph": "https://sparql.rhea-db.org/rhea" },
+                    { "cmd": "zcat ${data:TTL_DIR}/examples_uniprot.ttl.gz", "graph": "http://sparql.uniprot.org/.well-known/sparql-examples" },
+                    { "cmd": "zcat ${data:TTL_DIR}/core.ttl.gz", "graph": "http://purl.uniprot.org/core" },
+                    { "cmd": "zcat ${data:TTL_DIR}/void.ttl.gz", "graph": "http://rdfs.org/ns/void" }]
true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 } +STXXL_MEMORY = 60G [server] PORT = 7018 diff --git a/src/qlever/commands/index.py b/src/qlever/commands/index.py index cd4478e1..81babea1 100644 --- a/src/qlever/commands/index.py +++ b/src/qlever/commands/index.py @@ -99,13 +99,50 @@ def get_input_options_for_json(self, args) -> str: f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`", input_spec, ) - input_cmd = input_spec["cmd"] + # If the command contains a `{}` placeholder, we need a `for-each` + # key` specifying the pattern for the placeholder values, and vice + # versa. + if "{}" in input_spec["cmd"] and "for-each" not in input_spec: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` must contain a " + "key `for-each` if the command contains a placeholder " + "`{}`", + input_spec, + ) + if "for-each" in input_spec and "{}" not in input_spec["cmd"]: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` contains a " + "key `for-each`, but the command does not contain a " + "placeholder `{{}}`", + input_spec, + ) + # Get all commands. This is just the value of the `cmd` key if no + # `for-each` key is specified. Otherwise, we have a command for + # each file matching the pattern. + if "for-each" not in input_spec: + input_cmds = [input_spec["cmd"]] + else: + try: + files = glob.glob(input_spec["for-each"]) + except Exception as e: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` contains an " + f"invalid `for-each` pattern: {e}", + input_spec, + ) + input_cmds = [input_spec["cmd"].format(file) for file in files] # The `format`, `graph`, and `parallel` keys are optional. input_format = input_spec.get("format", args.format) input_graph = input_spec.get("graph", "-") input_parallel = input_spec.get("parallel", "false") # There must not be any other keys. - extra_keys = input_spec.keys() - {"cmd", "format", "graph", "parallel"} + extra_keys = input_spec.keys() - { + "cmd", + "format", + "graph", + "parallel", + "for-each", + } if extra_keys: raise self.InvalidInputJson( f"Element {i} in `MULTI_INPUT_JSON` must only contain " @@ -114,13 +151,15 @@ def get_input_options_for_json(self, args) -> str: input_spec, ) # Add the command-line options for this input stream. We use - # process substitution `<(...)` as a convenient way to handle - # an input stream just like a file. This is not POSIX compliant, - # but supported by various shells, including bash and zsh. - input_options.append( - f"-f <({input_cmd}) -F {input_format} " - f'-g "{input_graph}" -p {input_parallel}' - ) + # process substitution `<(...)` as a convenient way to handle an + # input stream just like a file. This is not POSIX compliant, but + # supported by various shells, including bash and zsh. If + # `for-each` is specified, add one command for each matching file. + for input_cmd in input_cmds: + input_options.append( + f"-f <({input_cmd}) -F {input_format} " + f'-g "{input_graph}" -p {input_parallel}' + ) # Return the concatenated command-line options. return " ".join(input_options)