From 547add98c24da7739700e5b9b5082a55fb19d50b Mon Sep 17 00:00:00 2001
From: Hannah Bast
Date: Wed, 4 Dec 2024 22:34:45 +0100
Subject: [PATCH] Easy `MULTI_INPUT_JSON` specification with `for-each` (#95)

So far, specifying the `MULTI_INPUT_JSON` was cumbersome when the input
consists of many files that are assigned to few graphs via a pattern. A
typical case is when all files in a directory or with a certain prefix
should be assigned to the same graph. There is now a nice and simple
syntax for this. Here are two example elements of a `MULTI_INPUT_JSON`
array, which should be self-explanatory:

```
{ "cmd": "zcat {}", "graph": "http://example.org/graph1", "for-each": "dir1/*.ttl.gz" }
{ "cmd": "zcat {}", "graph": "http://example.org/graph2", "for-each": "dir2/*.ttl.gz" }
```

Update the `Qleverfile` for UniProt to use this new feature (and read the
input as gzipped TTL files, and not N-Triples as before).
---
 src/qlever/Qleverfiles/Qleverfile.uniprot | 64 +++++++++++++++++------
 src/qlever/commands/index.py              | 57 ++++++++++++++++----
 2 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/src/qlever/Qleverfiles/Qleverfile.uniprot b/src/qlever/Qleverfiles/Qleverfile.uniprot
index 74169406..aa0c058e 100644
--- a/src/qlever/Qleverfiles/Qleverfile.uniprot
+++ b/src/qlever/Qleverfiles/Qleverfile.uniprot
@@ -1,30 +1,62 @@
 # Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control
 #
-# qlever get-data  # takes ~ 30 hours and ~ 2 TB of disk (for the NT files)
-# qlever index     # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever get-data  # takes ~ 30 hours and ~ 1.6 TB of disk (for the TTL files)
+# qlever index     # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 9950X)
 # qlever start     # starts the server (takes a few seconds)
 #
-# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv
+# Install packages: sudo apt install -y libxml2-utils parallel xz-utils wget
 # Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
 #
 # Set DATE to the date of the latest release. Build on SSD (requires ~ 7 TB
-# during build, ~ 3 TB after build). The uniprot.index.???.meta files can be on
-# HDD without significant performance loss (when running the server).
+# during build, ~ 3 TB after build).
 
 [data]
-NAME            = uniprot
-DATE            = 2024-05-29
-DOWNLOAD_URL    = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
-GET_RDFXML_CMD  = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s/<metalink .*>/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done
-RDFXML2NT_CMD   = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | gzip -c > nt.${DATE}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
-GET_DATA_CMD    = rdfxml --help && date > ${NAME}.get-data.begin-date && ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD} && date > ${NAME}.get-data.end-date
-DESCRIPTION     = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE}
+NAME             = uniprot
+DATE             = 2024-11-27
+RDFXML_DIR       = rdf.${DATE}
+TTL_DIR          = ttl.${DATE}
+UNIPROT_URL      = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
+RHEA_URL         = https://ftp.expasy.org/databases/rhea/rdf
+EXAMPLES_URL     = https://github.com/sib-swiss/sparql-examples
+GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone ${EXAMPLES_URL} && (cd sparql-examples && ./convertToOneTurtle.sh -p uniprot && gzip examples_uniprot.ttl && mv -f examples_uniprot.ttl.gz ../${TTL_DIR} && cd .. && rm -rf sparql-examples)
+GET_RDFXML_CMD   = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URL}/chebi.owl.gz"; echo "${RHEA_URL}/rhea.rdf.gz"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s/<metalink .*>/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done
+RDFXML2TTL_CMD   = mkdir -p ${TTL_DIR} && for RDFXML in ${RDFXML_DIR}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=ttl -q 2> ${TTL_DIR}/$$(basename $$RDFXML).stderr | gzip -c > ${TTL_DIR}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/ttl.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
+GET_DATA_CMD     = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date
+DESCRIPTION      = Complete UniProt data from ${UNIPROT_URL}, with additional data from ${RHEA_URL} and ${EXAMPLES_URL}
 
 [index]
-INPUT_FILES     = nt.${data:DATE}/*.nt.gz
-CAT_INPUT_FILES = parallel --tmpdir . -j 4 'zcat -f {}' ::: ${INPUT_FILES} | pv -q -B 5G
-SETTINGS_JSON   = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
-STXXL_MEMORY    = 60G
+INPUT_FILES      = ${data:TTL_DIR}/*.ttl.gz
+MULTI_INPUT_JSON = [{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_reviewed_*.ttl.gz" },
+                    { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_unreviewed_*.ttl.gz" },
+                    { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniparc", "for-each": "${data:TTL_DIR}/uniparc_*.ttl.gz" },
+                    { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniref", "for-each": "${data:TTL_DIR}/uniref*.ttl.gz" },
+                    { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/obsolete", "for-each": "${data:TTL_DIR}/uniprotkb_obsolete_*.ttl.gz" },
+                    { "cmd": "zcat ${data:TTL_DIR}/chebi.ttl.gz", "graph": "http://sparql.uniprot.org/chebi" },
+                    { "cmd": "zcat ${data:TTL_DIR}/citations_mapping.ttl.gz", "graph": "http://sparql.uniprot.org/citationmapping" },
+                    { "cmd": "zcat ${data:TTL_DIR}/citations.ttl.gz", "graph": "http://sparql.uniprot.org/citations" },
+                    { "cmd": "zcat ${data:TTL_DIR}/databases.ttl.gz", "graph": "http://sparql.uniprot.org/databases" },
+                    { "cmd": "zcat ${data:TTL_DIR}/diseases.ttl.gz", "graph": "http://sparql.uniprot.org/diseases" },
+                    { "cmd": "zcat ${data:TTL_DIR}/enzyme-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
+                    { "cmd": "zcat ${data:TTL_DIR}/enzyme.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
+                    { "cmd": "zcat ${data:TTL_DIR}/go-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
+                    { "cmd": "zcat ${data:TTL_DIR}/go.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
+                    { "cmd": "zcat ${data:TTL_DIR}/journals.ttl.gz", "graph": "http://sparql.uniprot.org/journal" },
+                    { "cmd": "zcat ${data:TTL_DIR}/keywords-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
+                    { "cmd": "zcat ${data:TTL_DIR}/keywords.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
+                    { "cmd": "zcat ${data:TTL_DIR}/locations-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
+                    { "cmd": "zcat ${data:TTL_DIR}/locations.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
+                    { "cmd": "zcat ${data:TTL_DIR}/pathways-hierarchy*.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
+                    { "cmd": "zcat ${data:TTL_DIR}/pathways.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
+                    { "cmd": "zcat ${data:TTL_DIR}/proteomes.ttl.gz", "graph": "http://sparql.uniprot.org/proteomes" },
+                    { "cmd": "zcat ${data:TTL_DIR}/taxonomy-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
+                    { "cmd": "zcat ${data:TTL_DIR}/taxonomy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
+                    { "cmd": "zcat ${data:TTL_DIR}/tissues.ttl.gz", "graph": "http://sparql.uniprot.org/tissues" },
+                    { "cmd": "zcat ${data:TTL_DIR}/rhea.ttl.gz", "graph": "https://sparql.rhea-db.org/rhea" },
+                    { "cmd": "zcat ${data:TTL_DIR}/examples_uniprot.ttl.gz", "graph": "http://sparql.uniprot.org/.well-known/sparql-examples" },
+                    { "cmd": "zcat ${data:TTL_DIR}/core.ttl.gz", "graph": "http://purl.uniprot.org/core" },
+                    { "cmd": "zcat ${data:TTL_DIR}/void.ttl.gz", "graph": "http://rdfs.org/ns/void" }]
true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 } +STXXL_MEMORY = 60G [server] PORT = 7018 diff --git a/src/qlever/commands/index.py b/src/qlever/commands/index.py index cd4478e1..81babea1 100644 --- a/src/qlever/commands/index.py +++ b/src/qlever/commands/index.py @@ -99,13 +99,50 @@ def get_input_options_for_json(self, args) -> str: f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`", input_spec, ) - input_cmd = input_spec["cmd"] + # If the command contains a `{}` placeholder, we need a `for-each` + # key` specifying the pattern for the placeholder values, and vice + # versa. + if "{}" in input_spec["cmd"] and "for-each" not in input_spec: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` must contain a " + "key `for-each` if the command contains a placeholder " + "`{}`", + input_spec, + ) + if "for-each" in input_spec and "{}" not in input_spec["cmd"]: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` contains a " + "key `for-each`, but the command does not contain a " + "placeholder `{{}}`", + input_spec, + ) + # Get all commands. This is just the value of the `cmd` key if no + # `for-each` key is specified. Otherwise, we have a command for + # each file matching the pattern. + if "for-each" not in input_spec: + input_cmds = [input_spec["cmd"]] + else: + try: + files = glob.glob(input_spec["for-each"]) + except Exception as e: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` contains an " + f"invalid `for-each` pattern: {e}", + input_spec, + ) + input_cmds = [input_spec["cmd"].format(file) for file in files] # The `format`, `graph`, and `parallel` keys are optional. input_format = input_spec.get("format", args.format) input_graph = input_spec.get("graph", "-") input_parallel = input_spec.get("parallel", "false") # There must not be any other keys. - extra_keys = input_spec.keys() - {"cmd", "format", "graph", "parallel"} + extra_keys = input_spec.keys() - { + "cmd", + "format", + "graph", + "parallel", + "for-each", + } if extra_keys: raise self.InvalidInputJson( f"Element {i} in `MULTI_INPUT_JSON` must only contain " @@ -114,13 +151,15 @@ def get_input_options_for_json(self, args) -> str: input_spec, ) # Add the command-line options for this input stream. We use - # process substitution `<(...)` as a convenient way to handle - # an input stream just like a file. This is not POSIX compliant, - # but supported by various shells, including bash and zsh. - input_options.append( - f"-f <({input_cmd}) -F {input_format} " - f'-g "{input_graph}" -p {input_parallel}' - ) + # process substitution `<(...)` as a convenient way to handle an + # input stream just like a file. This is not POSIX compliant, but + # supported by various shells, including bash and zsh. If + # `for-each` is specified, add one command for each matching file. + for input_cmd in input_cmds: + input_options.append( + f"-f <({input_cmd}) -F {input_format} " + f'-g "{input_graph}" -p {input_parallel}' + ) # Return the concatenated command-line options. return " ".join(input_options)