Merge pull request #96 from casework/release-0.9.0

Release 0.9.0

kchason authored Dec 8, 2022
2 parents 4123bff + be829b1 commit 102c9a0
Showing 61 changed files with 787 additions and 73 deletions.
10 changes: 10 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,10 @@
+# This file lists the contributors responsible for the
+# repository content. They will also be automatically
+# asked to review any pull request made in this repository.
+
+# Each line is a file pattern followed by one or more owners.
+# The sequence matters: later patterns take precedence.
+
+# FILES  OWNERS
+*        @casework/maintainers-global
+*        @casework/maintainers-case-python-utilities
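Because the last matching pattern wins, a hypothetical two-line illustration of the precedence rule (not part of this commit; team names invented):

# A changed .py file would request review only from @team-python;
# all other files fall through to @team-default.
*      @team-default
*.py   @team-python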
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,10 +1,10 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 22.10.0
     hooks:
       - id: black
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 5.0.4
     hooks:
       - id: flake8
   - repo: https://github.com/pycqa/isort
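For reference, these hooks can be exercised locally across the whole repository with pre-commit's standard CLI:

pre-commit run --all-files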
2 changes: 1 addition & 1 deletion case_utils/__init__.py
@@ -11,6 +11,6 @@
 #
 # We would appreciate acknowledgement if the software is used.

-__version__ = "0.8.0"
+__version__ = "0.9.0"

 from . import local_uuid  # noqa: F401
302 changes: 242 additions & 60 deletions case_utils/case_sparql_select/__init__.py
@@ -26,13 +26,14 @@
 Should a more complex query be necessary, an outer, wrapping SELECT query would let this script continue to function.
 """

-__version__ = "0.4.4"
+__version__ = "0.5.0"

 import argparse
 import binascii
 import logging
 import os
 import sys
+import typing

 import pandas as pd  # type: ignore
 import rdflib.plugins.sparql
@@ -48,69 +48,44 @@
 _logger = logging.getLogger(os.path.basename(__file__))


-def main() -> None:
-    parser = argparse.ArgumentParser()
-
-    # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
-    logging.basicConfig(
-        level=logging.DEBUG
-        if ("--debug" in sys.argv or "-d" in sys.argv)
-        else logging.INFO
-    )
-
-    parser.add_argument("-d", "--debug", action="store_true")
-    parser.add_argument(
-        "--built-version",
-        choices=tuple(built_version_choices_list),
-        default="case-" + CURRENT_CASE_VERSION,
-        help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
-    )
-    parser.add_argument(
-        "--disallow-empty-results",
-        action="store_true",
-        help="Raise error if no results are returned for query.",
-    )
-    parser.add_argument(
-        "out_table",
-        help="Expected extensions are .html for HTML tables or .md for Markdown tables.",
-    )
-    parser.add_argument(
-        "in_sparql",
-        help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
-    )
-    parser.add_argument("in_graph", nargs="+")
-    args = parser.parse_args()
+def query_text_to_variables(select_query_text: str) -> typing.List[str]:
+    # Build columns list from SELECT line.
+    select_query_text_lines = select_query_text.split("\n")
+    select_line = [
+        line for line in select_query_text_lines if line.startswith("SELECT ")
+    ][0]
+    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
+    return variables
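As a sketch of the new helper's behavior (the query text is hypothetical), note that it only inspects a SELECT clause sitting on its own line:

example_query = "SELECT ?name ?mimeType\nWHERE { ?file a uco-observable:File . }"
query_text_to_variables(example_query)
# -> ['?name', '?mimeType']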

-    graph = rdflib.Graph()
-    for in_graph_filename in args.in_graph:
-        graph.parse(in_graph_filename)

+def graph_and_query_to_data_frame(
+    graph: rdflib.Graph,
+    select_query_text: str,
+    *args: typing.Any,
+    built_version: str = "case-" + CURRENT_CASE_VERSION,
+    disallow_empty_results: bool = False,
+    use_prefixes: bool = False,
+    **kwargs: typing.Any,
+) -> pd.DataFrame:
     # Inherit prefixes defined in input context dictionary.
     nsdict = {k: v for (k, v) in graph.namespace_manager.namespaces()}

-    select_query_text = None
-    with open(args.in_sparql, "r") as in_fh:
-        select_query_text = in_fh.read().strip()
-    _logger.debug("select_query_text = %r." % select_query_text)

+    # Avoid side-effects on input parameter.
     if "subClassOf" in select_query_text:
-        case_utils.ontology.load_subclass_hierarchy(
-            graph, built_version=args.built_version
-        )
+        _graph = rdflib.Graph()
+        _graph += graph
+        case_utils.ontology.load_subclass_hierarchy(_graph, built_version=built_version)
+    else:
+        _graph = graph

-    # Build columns list from SELECT line.
-    select_query_text_lines = select_query_text.split("\n")
-    select_line = [
-        line for line in select_query_text_lines if line.startswith("SELECT ")
-    ][0]
-    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
+    variables = query_text_to_variables(select_query_text)

     tally = 0
     records = []
     select_query_object = rdflib.plugins.sparql.processor.prepareQuery(
         select_query_text, initNs=nsdict
     )
-    for (row_no, row) in enumerate(graph.query(select_query_object)):
+    for (row_no, row) in enumerate(_graph.query(select_query_object)):
         tally = row_no + 1
         record = []
         for (column_no, column) in enumerate(row):
@@ -124,35 +124,241 @@ def main() -> None:
                 # The render to ASCII is in support of this script rendering results for website viewing.
                 # .decode() is because hexlify returns bytes.
                 column_value = binascii.hexlify(column.toPython()).decode()
+            elif isinstance(column, rdflib.URIRef):
+                if use_prefixes:
+                    column_value = graph.namespace_manager.qname(column.toPython())
+                else:
+                    column_value = column.toPython()
             else:
                 column_value = column.toPython()
             if row_no == 0:
                 _logger.debug("row[0]column[%d] = %r." % (column_no, column_value))
             record.append(column_value)
         records.append(record)

     if tally == 0:
-        if args.disallow_empty_results:
+        if disallow_empty_results:
             raise ValueError("Failed to return any results.")

     df = pd.DataFrame(records, columns=variables)
+    return df
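A minimal usage sketch of the new function (file name and the uco-observable prefix binding are hypothetical; the SELECT clause must occupy its own line, per query_text_to_variables):

graph = rdflib.Graph()
graph.parse("evidence.json", format="json-ld")  # e.g. a CASE JSON-LD graph
query = "\n".join([
    "SELECT ?file",
    "WHERE {",
    "  ?file a uco-observable:File .",
    "}",
])
df = graph_and_query_to_data_frame(graph, query, use_prefixes=True)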


+def data_frame_to_table_text(
+    df: pd.DataFrame,
+    *args: typing.Any,
+    json_indent: typing.Optional[int] = None,
+    json_orient: str,
+    output_mode: str,
+    use_header: bool,
+    use_index: bool,
+    **kwargs: typing.Any,
+) -> str:
+    table_text: typing.Optional[str] = None

-    table_text = None
-    if args.out_table.endswith(".html"):
+    # Set up kwargs dicts. One kwarg behaves slightly differently for Markdown vs. other formats.
+    general_kwargs: typing.Dict[str, typing.Any] = dict()
+    md_kwargs: typing.Dict[str, typing.Any] = dict()
+
+    # Note some output modes will drop 'header' from general_kwargs, due to alternate support or lack of support.
+    if use_header:
+        general_kwargs["header"] = True
+    else:
+        general_kwargs["header"] = False
+        md_kwargs["headers"] = tuple()
+
+    general_kwargs["index"] = use_index
+
+    if output_mode in {"csv", "tsv"}:
+        sep: str
+        if output_mode == "csv":
+            sep = ","
+        elif output_mode == "tsv":
+            sep = "\t"
+        else:
+            raise NotImplementedError(
+                "Output extension not implemented in CSV-style output."
+            )
+        table_text = df.to_csv(sep=sep, **general_kwargs)
+    elif output_mode == "html":
         # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
         # Add CSS classes for CASE website Bootstrap support.
-        table_text = df.to_html(classes=("table", "table-bordered", "table-condensed"))
-    elif args.out_table.endswith(".md"):
+        table_text = df.to_html(
+            classes=("table", "table-bordered", "table-condensed"), **general_kwargs
+        )
+    elif output_mode == "json":
+        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html
+
+        # Drop unsupported kwarg.
+        del general_kwargs["header"]
+
+        table_text = df.to_json(
+            indent=json_indent, orient=json_orient, date_format="iso", **general_kwargs
+        )
+    elif output_mode == "md":
         # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
         # https://pypi.org/project/tabulate/
         # Assume Github-flavored Markdown.
-        table_text = df.to_markdown(tablefmt="github")
-    if table_text is None:
-        raise NotImplementedError(
-            "Unsupported output extension for output filename %r.", args.out_table
-        )
+
+        # Drop unsupported kwarg.
+        del general_kwargs["header"]
+
+        table_text = df.to_markdown(tablefmt="github", **general_kwargs, **md_kwargs)
+    else:
+        if table_text is None:
+            raise NotImplementedError("Unimplemented output mode: %r." % output_mode)
+    assert table_text is not None

+    return table_text
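A minimal usage sketch of the renderer (values hypothetical):

df = pd.DataFrame([["a", 1], ["b", 2]], columns=["letter", "count"])
table_text = data_frame_to_table_text(
    df,
    json_indent=None,
    json_orient="columns",
    output_mode="md",
    use_header=True,
    use_index=False,
)

With output_mode="json", use_index=True, and json_orient="records", the same frame would instead serialize as [{"letter":"a","count":1},{"letter":"b","count":2}].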


+def main() -> None:
+    parser = argparse.ArgumentParser()
+
+    # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
+    logging.basicConfig(
+        level=logging.DEBUG
+        if ("--debug" in sys.argv or "-d" in sys.argv)
+        else logging.INFO
+    )
+
+    parser.add_argument("-d", "--debug", action="store_true")
+    parser.add_argument(
+        "--built-version",
+        choices=tuple(built_version_choices_list),
+        default="case-" + CURRENT_CASE_VERSION,
+        help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
+    )
+    parser.add_argument(
+        "--disallow-empty-results",
+        action="store_true",
+        help="Raise error if no results are returned for query.",
+    )
+    parser.add_argument(
+        "--json-indent",
+        type=int,
+        help="Number of whitespace characters to use for indentation. Only applicable for JSON output.",
+    )
+    parser.add_argument(
+        "--json-orient",
+        default="columns",
+        choices=("columns", "index", "records", "split", "table", "values"),
+        help="Orientation to use for Pandas DataFrame JSON output. Only applicable for JSON output.",
+    )
+    parser.add_argument(
+        "--use-prefixes",
+        action="store_true",
+        help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
+    )
+    parser.add_argument(
+        "out_table",
+        help="Expected extensions are .html for HTML tables, .json for JSON tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values. Note that JSON is a Pandas output JSON format (chosen by '--json-orient'), and not JSON-LD.",
+    )
+    parser.add_argument(
+        "in_sparql",
+        help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
+    )
+
+    parser_header_group = parser.add_mutually_exclusive_group(required=False)
+    parser_header_group.add_argument(
+        "--header",
+        action="store_true",
+        help="Print column labels. This is the default behavior.",
+    )
+    parser_header_group.add_argument(
+        "--no-header",
+        action="store_true",
+        help="Do not print column labels.",
+    )
+
+    parser_index_group = parser.add_mutually_exclusive_group(required=False)
+    parser_index_group.add_argument(
+        "--index",
+        action="store_true",
+        help="Print index (auto-incrementing row labels as left untitled column). This is the default behavior.",
+    )
+    parser_index_group.add_argument(
+        "--no-index",
+        action="store_true",
+        help="Do not print index. If output is JSON, --json-orient must be 'split' or 'table'.",
+    )
+
+    parser.add_argument("in_graph", nargs="+")
+    args = parser.parse_args()
+
+    output_mode: str
+    if args.out_table.endswith(".csv"):
+        output_mode = "csv"
+    elif args.out_table.endswith(".html"):
+        output_mode = "html"
+    elif args.out_table.endswith(".json"):
+        output_mode = "json"
+    elif args.out_table.endswith(".md"):
+        output_mode = "md"
+    elif args.out_table.endswith(".tsv"):
+        output_mode = "tsv"
+    else:
+        raise NotImplementedError("Output file extension not implemented.")
+
+    graph = rdflib.Graph()
+    for in_graph_filename in args.in_graph:
+        graph.parse(in_graph_filename)
+
+    select_query_text: typing.Optional[str] = None
+    with open(args.in_sparql, "r") as in_fh:
+        select_query_text = in_fh.read().strip()
+    if select_query_text is None:
+        raise ValueError("Failed to load query.")
+    _logger.debug("select_query_text = %r." % select_query_text)
+
+    # Process --header and --no-header.
+    use_header: bool
+    if args.header is True:
+        use_header = True
+    if args.no_header is True:
+        use_header = False
+    else:
+        use_header = True
+
+    # Process --index and --no-index.
+    use_index: bool
+    if args.index is True:
+        use_index = True
+    if args.no_index is True:
+        use_index = False
+    else:
+        use_index = True
+
+    if (
+        output_mode == "json"
+        and use_index is False
+        and args.json_orient not in {"split", "table"}
+    ):
+        raise ValueError(
+            "For JSON output, --no-index flag requires --json-orient to be either 'split' or 'table'."
+        )
+
+    df = graph_and_query_to_data_frame(
+        graph,
+        select_query_text,
+        built_version=args.built_version,
+        disallow_empty_results=args.disallow_empty_results is True,
+        use_prefixes=args.use_prefixes is True,
+    )
+
+    table_text = data_frame_to_table_text(
+        df,
+        json_indent=args.json_indent,
+        json_orient=args.json_orient,
+        output_mode=output_mode,
+        use_header=use_header,
+        use_index=use_index,
+    )
+    with open(args.out_table, "w") as out_fh:
+        out_fh.write(table_text)
+        if table_text[-1] != "\n":
+            # End file with newline. CSV and TSV modes end with a built-in newline.
+            out_fh.write("\n")


if __name__ == "__main__":
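To tie the new flags together, a hypothetical end-to-end invocation of the installed console script (entry-point name and file names assumed):

case_sparql_select --use-prefixes --no-index output.md query.sparql evidence.json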