diff --git a/crate_anon/anonymise/anonregex.py b/crate_anon/anonymise/anonregex.py
index 78f99505..830bc7d5 100755
--- a/crate_anon/anonymise/anonregex.py
+++ b/crate_anon/anonymise/anonregex.py
@@ -527,7 +527,7 @@ def get_uk_postcode_regex_elements(
See:
- https://stackoverflow.com/questions/164979/regex-for-matching-uk-postcodes
- """ # noqa
+ """ # noqa: E501
# -------------------------------------------------------------------------
# Old
# -------------------------------------------------------------------------
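
For orientation, a simplified illustration of the kind of element this
function assembles (this is NOT the validated pattern from this module):

    import regex  # the third-party 'regex' package used by this module

    # Rough UK postcode shape, e.g. "CB2 0QQ": area letters, district,
    # optional space, then sector digit and unit letters. Illustrative only.
    POSTCODE_LIKE = regex.compile(r"\b[A-Z]{1,2}[0-9][0-9A-Z]? ?[0-9][A-Z]{2}\b")
    assert POSTCODE_LIKE.search("Lives at CB2 0QQ, Cambridge.")
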
diff --git a/crate_anon/anonymise/config.py b/crate_anon/anonymise/config.py
index c5e5a4ce..c19ea4e7 100644
--- a/crate_anon/anonymise/config.py
+++ b/crate_anon/anonymise/config.py
@@ -456,7 +456,7 @@ def get_word_alternatives(filenames: List[str]) -> List[List[str]]:
Returns:
a list of lists of equivalent words
- """ # noqa
+ """ # noqa: E501
alternatives = [] # type: List[List[str]]
all_words_seen = set() # type: Set[str]
for filename in filenames:
diff --git a/crate_anon/anonymise/constants.py b/crate_anon/anonymise/constants.py
index f3b06798..f7c7117f 100644
--- a/crate_anon/anonymise/constants.py
+++ b/crate_anon/anonymise/constants.py
@@ -198,7 +198,7 @@ class SrcFlag(StrEnum):
# MySQL: 64 -- http://dev.mysql.com/doc/refman/5.7/en/identifiers.html
SQLSERVER_MAX_IDENTIFIER_LENGTH = 128
# Microsoft SQL Server: 128 --
-# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers # noqa
+# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers # noqa: E501
# =============================================================================
@@ -862,7 +862,7 @@ class HashConfigKeys:
{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} =
{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} =
-""" # noqa
+""" # noqa: E501
# For the style:
# [source_databases]
diff --git a/crate_anon/anonymise/dd.py b/crate_anon/anonymise/dd.py
index f6bbfba5..d62b715e 100644
--- a/crate_anon/anonymise/dd.py
+++ b/crate_anon/anonymise/dd.py
@@ -571,7 +571,7 @@ def tidy_draft(self) -> None:
CREATE TABLE junk (intthing INT PRIMARY KEY, text1 LONGTEXT, text2 LONGTEXT);
ALTER TABLE junk ADD FULLTEXT INDEX ftidx1 (text1);
ALTER TABLE junk ADD FULLTEXT INDEX ftidx2 (text2); -- OK
- """ # noqa
+ """ # noqa: E501
log.info("Tidying/correcting draft data dictionary")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/crate_anon/anonymise/ddr.py b/crate_anon/anonymise/ddr.py
index 1ebe2925..edd1dfcc 100644
--- a/crate_anon/anonymise/ddr.py
+++ b/crate_anon/anonymise/ddr.py
@@ -1045,7 +1045,7 @@ def dest_sqla_column(self) -> Column:
kwargs = {
"doc": comment, # Python side
"comment": comment, # SQL side; supported from SQLAlchemy 1.2:
- # https://docs.sqlalchemy.org/en/14/core/metadata.html#sqlalchemy.schema.Column.params.comment # noqa
+ # https://docs.sqlalchemy.org/en/14/core/metadata.html#sqlalchemy.schema.Column.params.comment # noqa: E501
}
if self.pk:
kwargs["primary_key"] = True
diff --git a/crate_anon/anonymise/launch_multiprocess_anonymiser.py b/crate_anon/anonymise/launch_multiprocess_anonymiser.py
index 6719ad44..33759f31 100755
--- a/crate_anon/anonymise/launch_multiprocess_anonymiser.py
+++ b/crate_anon/anonymise/launch_multiprocess_anonymiser.py
@@ -41,7 +41,7 @@
- https://stackoverflow.com/questions/23611396/python-execute-cat-subprocess-in-parallel
- https://stackoverflow.com/questions/320232/ensuring-subprocesses-are-dead-on-exiting-python-program
- https://stackoverflow.com/questions/641420/how-should-i-log-while-using-multiprocessing-in-python
-""" # noqa
+""" # noqa: E501
import argparse
import logging
diff --git a/crate_anon/anonymise/models.py b/crate_anon/anonymise/models.py
index 32fb2e2c..c580f895 100644
--- a/crate_anon/anonymise/models.py
+++ b/crate_anon/anonymise/models.py
@@ -35,7 +35,7 @@
To create a SQLAlchemy ORM programmatically:
- https://stackoverflow.com/questions/2574105/sqlalchemy-dynamic-mapping/2575016#2575016
-""" # noqa
+""" # noqa: E501
import logging
import random
diff --git a/crate_anon/common/bugfix_flashtext.py b/crate_anon/common/bugfix_flashtext.py
index 0c60d11d..f5eee77d 100644
--- a/crate_anon/common/bugfix_flashtext.py
+++ b/crate_anon/common/bugfix_flashtext.py
@@ -54,13 +54,13 @@ def replace_keywords(self, a_sentence: str) -> str:
if not self.case_sensitive:
sentence = a_sentence.lower()
# by Ihor Bobak:
- # some letters can expand in size when lower() is called, therefore we will preprocess # noqa
- # a_sentense to find those letters which lower()-ed to 2 or more symbols. # noqa
- # So, imagine that X is lowered as yz, the rest are lowered as is: A->a, B->b, C->c # noqa
+ # some letters can expand in size when lower() is called, therefore we will preprocess # noqa: E501
+            # a_sentence to find those letters that lower() into 2 or more symbols.  # noqa: E501
+ # So, imagine that X is lowered as yz, the rest are lowered as is: A->a, B->b, C->c # noqa: E501
# then for the string ABCXABC we want to get
# ['A', 'B', 'C', 'X', '', 'A', 'B', 'C'] which corresponds to
- # ['a', 'b', 'c', 'y', 'z', 'a', 'b', 'c'] because when the code below will run by the indexes # noqa
- # of the lowered string, it will "glue" the original string also by THE SAME indexes # noqa
+ # ['a', 'b', 'c', 'y', 'z', 'a', 'b', 'c'] because when the code below will run by the indexes # noqa: E501
+ # of the lowered string, it will "glue" the original string also by THE SAME indexes # noqa: E501
orig_sentence = []
for i in range(0, len(a_sentence)):
char = a_sentence[i]
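
A self-contained illustration (not from the patch) of the expansion problem
described in the comments above: some characters grow when lower()-ed, so
indexes into the lowered string drift out of step with the original.

    s = "\u0130stanbul"         # 'İstanbul': capital I with a dot above
    assert len(s) == 8
    assert len(s.lower()) == 9  # 'İ'.lower() is 'i' plus a combining dot
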
diff --git a/crate_anon/common/constants.py b/crate_anon/common/constants.py
index 705a191c..bbea601a 100644
--- a/crate_anon/common/constants.py
+++ b/crate_anon/common/constants.py
@@ -52,6 +52,8 @@
# Is this program running on readthedocs.org?
ON_READTHEDOCS = os.environ.get("READTHEDOCS") == "True"
+NoneType = type(None)  # for isinstance() checks
+
# =============================================================================
# Constant creation
@@ -111,7 +113,7 @@ class DockerConstants:
HOST = "0.0.0.0"
# ... not "localhost" or "127.0.0.1"; see
- # https://nickjanetakis.com/blog/docker-tip-54-fixing-connection-reset-by-peer-or-similar-errors # noqa
+ # https://nickjanetakis.com/blog/docker-tip-54-fixing-connection-reset-by-peer-or-similar-errors # noqa: E501
# =============================================================================
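
As a usage note for the new constant: the isinstance() idiom it supports
(the helper below is illustrative only).

    NoneType = type(None)

    def is_optional_int(x) -> bool:
        # Accept "int or None" in a single isinstance() call.
        return isinstance(x, (int, NoneType))

    assert is_optional_int(3) and is_optional_int(None)
    assert not is_optional_int("3")
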
diff --git a/crate_anon/common/extendedconfigparser.py b/crate_anon/common/extendedconfigparser.py
index a29d53b9..de6e81ee 100644
--- a/crate_anon/common/extendedconfigparser.py
+++ b/crate_anon/common/extendedconfigparser.py
@@ -30,6 +30,7 @@
import ast
import configparser
import logging
+import os.path
from typing import (
Any,
Dict,
@@ -148,7 +149,7 @@ def __init__(self, *args, case_sensitive: bool = False, **kwargs) -> None:
# 'converters': Python 3.5 and up
super().__init__(*args, **kwargs)
if case_sensitive:
- # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser # noqa
+ # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser # noqa: E501
self.optionxform = str
# Use the underlying ConfigParser class for e.g.
@@ -520,6 +521,10 @@ def __init__(
self.parser = ExtendedConfigParser(case_sensitive=case_sensitive)
if filename:
log.info(f"Reading config file: {filename}")
+ if not os.path.isfile(filename):
+ raise RuntimeError(
+ f"Config file {filename} does not exist"
+ )
self.parser.read(filename, encoding=encoding)
else:
self.parser.read_file(fileobj)
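
The explicit existence check above matters because ConfigParser.read()
skips missing files silently; a self-contained demonstration:

    import configparser

    parser = configparser.ConfigParser()
    # read() returns the list of files successfully parsed; a missing file
    # is ignored rather than raising, hence the os.path.isfile() check.
    assert parser.read("no_such_file.ini") == []
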
diff --git a/crate_anon/common/memsize.py b/crate_anon/common/memsize.py
index 129caf82..601f7cc4 100644
--- a/crate_anon/common/memsize.py
+++ b/crate_anon/common/memsize.py
@@ -27,7 +27,7 @@
From https://stackoverflow.com/questions/449560/how-do-i-determine-the-size-of-an-object-in-python
-""" # noqa
+""" # noqa: E501
from gc import get_referents
from sys import getsizeof
@@ -52,7 +52,7 @@ def getsize(obj: Any, assume_none_denied: bool = False) -> int:
Skip checks for classes/modules/functions. Assume that all objects
should be checked (typically, meaning that the caller guarantees
not to pass stuff that doesn't need checking).
- """ # noqa
+ """ # noqa: E501
if not assume_none_denied:
if isinstance(obj, DENYLIST):
raise TypeError(
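
A usage sketch for getsize() as documented above (byte counts vary by
platform and Python version):

    from crate_anon.common.memsize import getsize

    nested = {"a": [1, 2, 3], "b": ("x" * 100,)}
    # Recursive size via gc.get_referents(), so this exceeds a bare
    # sys.getsizeof(nested), which counts the outer dict only.
    print(getsize(nested))
    print(getsize(nested, assume_none_denied=True))  # skip denylist checks
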
diff --git a/crate_anon/common/regex_helpers.py b/crate_anon/common/regex_helpers.py
index 82d9e94e..3f8df720 100644
--- a/crate_anon/common/regex_helpers.py
+++ b/crate_anon/common/regex_helpers.py
@@ -91,7 +91,7 @@
_NOT_EMPTY_WORD_ONLY_REGEX = regex.compile(r"^\w+$")
_NOT_EMPTY_ALPHABETICAL_ONLY_REGEX = regex.compile("^[a-zA-Z]+$")
-# cf. https://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores # noqa
+# cf. https://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores # noqa: E501
# =============================================================================
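
Illustrative checks of the two module-level patterns shown above, using the
same third-party 'regex' package:

    import regex

    word_only = regex.compile(r"^\w+$")        # _NOT_EMPTY_WORD_ONLY_REGEX
    alpha_only = regex.compile("^[a-zA-Z]+$")  # the alphabetical-only pattern
    assert word_only.match("abc_123")       # \w: letters, digits, underscore
    assert not alpha_only.match("abc_123")  # letters only: no digits or "_"
    assert not word_only.match("")          # both require >= 1 character
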
diff --git a/crate_anon/common/spreadsheet.py b/crate_anon/common/spreadsheet.py
index 99a8dad3..ec473aa4 100644
--- a/crate_anon/common/spreadsheet.py
+++ b/crate_anon/common/spreadsheet.py
@@ -289,7 +289,7 @@ def write_spreadsheet(
ext = filetype or os.path.splitext(filename)[1]
if filename == "-" or ext == SpreadsheetFileExtensions.TSV.value:
first_key = next(iter(data))
- # https://stackoverflow.com/questions/30362391/how-do-you-find-the-first-key-in-a-dictionary # noqa
+ # https://stackoverflow.com/questions/30362391/how-do-you-find-the-first-key-in-a-dictionary # noqa: E501
first_sheet = data[first_key]
write_tsv(filename, first_sheet)
elif ext == SpreadsheetFileExtensions.CSV.value:
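
The next(iter(data)) idiom above relies on dictionaries preserving insertion
order (guaranteed since Python 3.7); for example:

    data = {"sheet1": [["a"]], "sheet2": [["b"]]}
    first_key = next(iter(data))  # no list(data.keys())[0] copy required
    assert first_key == "sheet1"
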
diff --git a/crate_anon/common/sql.py b/crate_anon/common/sql.py
index da651430..afd300db 100644
--- a/crate_anon/common/sql.py
+++ b/crate_anon/common/sql.py
@@ -151,9 +151,9 @@
MSSQL_COLTYPE_TO_LEN = {
# The "N" prefix means Unicode.
- # https://docs.microsoft.com/en-us/sql/t-sql/data-types/char-and-varchar-transact-sql?view=sql-server-ver15 # noqa
- # https://docs.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-ver15 # noqa
- # https://docs.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver15 # noqa
+ # https://docs.microsoft.com/en-us/sql/t-sql/data-types/char-and-varchar-transact-sql?view=sql-server-ver15 # noqa: E501
+ # https://docs.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-ver15 # noqa: E501
+ # https://docs.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver15 # noqa: E501
"NVARCHAR_MAX": 2**30 - 1,
# Can specify NVARCHAR(1) to NVARCHAR(4000), or NVARCHAR(MAX) for 2^30 - 1.
"VARCHAR_MAX": 2**31 - 1,
@@ -222,7 +222,7 @@ class SchemaId:
- https://stackoverflow.com/questions/11618277/difference-between-schema-database-in-mysql
- """ # noqa
+ """ # noqa: E501
def __init__(self, db: str = "", schema: str = "") -> None:
"""
@@ -2452,7 +2452,7 @@ def translate_sql_qmark_to_percent(sql: str) -> str:
:class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar` objects, so that
the visual appearance matches what they expect from their database.
- """ # noqa
+ """ # noqa: E501
# 1. Escape % characters
sql = escape_percent_for_python_dbapi(sql)
# 2. Replace ? characters that are not within quotes with %s.
@@ -2466,3 +2466,29 @@ def translate_sql_qmark_to_percent(sql: str) -> str:
else:
newsql += c
return newsql
+
+
+def decorate_index_name(
+ idxname: str, tablename: str = None, engine: Engine = None
+) -> str:
+ """
+    Amend the name of a database index. Specifically, this is because SQLite
+    (which we won't use much, but do use for testing!) won't accept two
+    indexes with the same name on different tables.
+
+ Args:
+ idxname:
+ The original index name.
+ tablename:
+ The name of the table.
+ engine:
+ The SQLAlchemy engine, from which we obtain the dialect.
+
+ Returns:
+ The index name, amended if necessary.
+ """
+ if not tablename or not engine:
+ return idxname
+ if engine.dialect.name == "sqlite":
+ return f"{idxname}_{tablename}"
+ return idxname
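
A usage sketch for the new decorate_index_name() (the in-memory SQLite
engine below is an assumption for illustration):

    from sqlalchemy import create_engine

    from crate_anon.common.sql import decorate_index_name

    engine = create_engine("sqlite://")  # in-memory SQLite
    # SQLite keeps index names in a single per-database namespace, so the
    # table name is appended to keep "_idx__set" unique across tables:
    assert decorate_index_name("_idx__set", "t1", engine) == "_idx__set_t1"
    # With no table/engine (or a non-SQLite dialect), the name is unchanged:
    assert decorate_index_name("_idx__set") == "_idx__set"
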
diff --git a/crate_anon/crateweb/config/settings.py b/crate_anon/crateweb/config/settings.py
index 8f189819..2df0909a 100644
--- a/crate_anon/crateweb/config/settings.py
+++ b/crate_anon/crateweb/config/settings.py
@@ -58,7 +58,7 @@
UrlNames,
)
-# https://stackoverflow.com/questions/2636536/how-to-make-django-work-with-unsupported-mysql-drivers-such-as-gevent-mysql-or-c # noqa
+# https://stackoverflow.com/questions/2636536/how-to-make-django-work-with-unsupported-mysql-drivers-such-as-gevent-mysql-or-c # noqa: E501
try:
import pymysql
@@ -115,10 +115,10 @@
# 'kombu.transport.django', # for Celery with Django database as broker
# 'template_profiler_panel',
# 'silk',
- "crate_anon.crateweb.config.apps.UserProfileAppConfig", # for user-specific settings # noqa
- "crate_anon.crateweb.config.apps.ResearchAppConfig", # the research database query app # noqa
- "crate_anon.crateweb.config.apps.ConsentAppConfig", # the consent-to-contact app # noqa
- "crate_anon.crateweb.config.apps.CoreAppConfig", # for e.g. the runcpserver command # noqa
+ "crate_anon.crateweb.config.apps.UserProfileAppConfig", # for user-specific settings # noqa: E501
+ "crate_anon.crateweb.config.apps.ResearchAppConfig", # the research database query app # noqa: E501
+ "crate_anon.crateweb.config.apps.ConsentAppConfig", # the consent-to-contact app # noqa: E501
+ "crate_anon.crateweb.config.apps.CoreAppConfig", # for e.g. the runcpserver command # noqa: E501
"crate_anon.crateweb.config.apps.ApiConfig", # for the anonymisation API
)
@@ -130,7 +130,7 @@
# ... reinstated here 2017-01-30 (django-debug-toolbar==1.6)
# ... "as early as possible... but after any other middle that encodes the
# response's content, such as GZipMiddleware"
- # ... http://django-debug-toolbar.readthedocs.io/en/1.0/installation.html#explicit-setup # noqa
+ # ... http://django-debug-toolbar.readthedocs.io/en/1.0/installation.html#explicit-setup # noqa: E501
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
@@ -140,11 +140,11 @@
"django.middleware.clickjacking.XFrameOptionsMiddleware",
"django.middleware.security.SecurityMiddleware",
# Additional:
- "cardinal_pythonlib.django.middleware.UserBasedExceptionMiddleware", # provide debugging details to superusers # noqa
- "cardinal_pythonlib.django.middleware.LoginRequiredMiddleware", # prohibit all pages except login pages if not logged in # noqa
- # 'cardinal_pythonlib.django.middleware.DisableClientSideCachingMiddleware', # no client-side caching # noqa
- "crate_anon.crateweb.core.middleware.RestrictAdminMiddleware", # non-developers can't access the devadmin site # noqa
- # 'cardinal_pythonlib.django.request_cache.RequestCacheMiddleware', # per-request cache, UNTESTED # noqa
+ "cardinal_pythonlib.django.middleware.UserBasedExceptionMiddleware", # provide debugging details to superusers # noqa: E501
+ "cardinal_pythonlib.django.middleware.LoginRequiredMiddleware", # prohibit all pages except login pages if not logged in # noqa: E501
+ # 'cardinal_pythonlib.django.middleware.DisableClientSideCachingMiddleware', # no client-side caching # noqa: E501
+ "crate_anon.crateweb.core.middleware.RestrictAdminMiddleware", # non-developers can't access the devadmin site # noqa: E501
+ # 'cardinal_pythonlib.django.request_cache.RequestCacheMiddleware', # per-request cache, UNTESTED # noqa: E501
)
LOGIN_URL = "/login/" # for LoginRequiredMiddleware
@@ -166,7 +166,7 @@
"django.template.context_processors.request",
"django.contrib.auth.context_processors.auth",
"django.contrib.messages.context_processors.messages",
- "crate_anon.crateweb.core.context_processors.common_context", # noqa
+ "crate_anon.crateweb.core.context_processors.common_context",
],
"loaders": [
# https://docs.djangoproject.com/en/1.9/ref/templates/api/
@@ -206,9 +206,9 @@
"debug_toolbar.panels.logging.LoggingPanel",
"debug_toolbar.panels.redirects.RedirectsPanel",
# Built in but not enabled as standard:
- # 'debug_toolbar.panels.profiling.ProfilingPanel', # EXTREME DANGER! Breaks middleware inc. LoginRequiredMiddleware! # noqa
+ # 'debug_toolbar.panels.profiling.ProfilingPanel', # EXTREME DANGER! Breaks middleware inc. LoginRequiredMiddleware! # noqa: E501
# Extra:
- # 'template_profiler_panel.panels.template.TemplateProfilerPanel', # removed 2017-01-31; division by zero error # noqa
+ # 'template_profiler_panel.panels.template.TemplateProfilerPanel', # removed 2017-01-31; division by zero error # noqa: E501
]
@@ -223,7 +223,7 @@
CELERY_TASK_SERIALIZER = "json"
# Results are OPTIONAL. The CRATE web service doesn't use them.
# But may be helpful for Celery testing.
-# See http://docs.celeryproject.org/en/latest/configuration.html#std:setting-CELERY_RESULT_BACKEND # noqa
+# See http://docs.celeryproject.org/en/latest/configuration.html#std:setting-CELERY_RESULT_BACKEND # noqa: E501
CELERY_RESULT_BACKEND = "rpc://" # uses AMQP
CELERY_RESULT_PERSISTENT = False
diff --git a/crate_anon/crateweb/consent/lookup_rio.py b/crate_anon/crateweb/consent/lookup_rio.py
index eb816871..de26d95c 100644
--- a/crate_anon/crateweb/consent/lookup_rio.py
+++ b/crate_anon/crateweb/consent/lookup_rio.py
@@ -977,7 +977,7 @@ def get_latest_consent_mode_from_rio_generic(
po.NHSNumber = %s -- string comparison
ORDER BY
cr.AssessmentDate DESC
- """ # noqa
+ """ # noqa: E501
# BEWARE "%s" IN SQL COMMENTS! The database backend will crash because
# the number of substituted parameters will be wrong.
# New as of 2018-06-28:
@@ -999,7 +999,7 @@ def get_latest_consent_mode_from_rio_generic(
cr.NHSNumber = %s -- string comparison
ORDER BY
cr.AssessmentDate DESC
- """ # noqa
+ """ # noqa: E501
else:
assert False, "Internal bug" # makes type checker happy
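
The warning above about "%s" in SQL comments reflects how Django database
cursors (which use %s placeholders) count parameters; a sketch of the
failure mode, with illustrative SQL:

    sql = """
        SELECT po.NHSNumber
        FROM SomeTable po
        WHERE po.NHSNumber = %s  -- string comparison, never %s as a number
    """
    # cursor.execute(sql, [nhs_number])  # fails: the backend sees TWO "%s"
    # markers (one inside the comment) but receives only one parameter.
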
diff --git a/crate_anon/crateweb/consent/tasks.py b/crate_anon/crateweb/consent/tasks.py
index 6e17203e..8aaa5f9a 100644
--- a/crate_anon/crateweb/consent/tasks.py
+++ b/crate_anon/crateweb/consent/tasks.py
@@ -97,7 +97,7 @@
Requires Django 1.9. As of 2015-11-21, that means 1.9rc1
-""" # noqa
+""" # noqa: E501
import logging
from celery import shared_task
@@ -195,7 +195,7 @@ def finalize_clinician_response(clinician_response_id: int) -> None:
Args:
clinician_response_id: PK of the clinician response
- """ # noqa
+ """ # noqa: E501
from crate_anon.crateweb.consent.models import (
ClinicianResponse,
) # delayed import
diff --git a/crate_anon/crateweb/core/management/commands/runcpserver.py b/crate_anon/crateweb/core/management/commands/runcpserver.py
index ba620c3e..89002089 100755
--- a/crate_anon/crateweb/core/management/commands/runcpserver.py
+++ b/crate_anon/crateweb/core/management/commands/runcpserver.py
@@ -42,7 +42,7 @@
./manage.py runcpserver --port 8080 --ssl_certificate /etc/ssl/certs/ssl-cert-snakeoil.pem --ssl_private_key /etc/ssl/private/ssl-cert-snakeoil.key
-""" # noqa
+""" # noqa: E501
from argparse import ArgumentParser, Namespace
import logging
diff --git a/crate_anon/crateweb/core/utils.py b/crate_anon/crateweb/core/utils.py
index bb146bc1..3d92e177 100644
--- a/crate_anon/crateweb/core/utils.py
+++ b/crate_anon/crateweb/core/utils.py
@@ -237,7 +237,7 @@ def site_absolute_url(path: str) -> str:
But that does at least mean we can use the same method for static and
Django URLs.
- """ # noqa
+ """ # noqa: E501
url = settings.DJANGO_SITE_ROOT_ABSOLUTE_URL + path
log.debug(f"site_absolute_url: {path} -> {url}")
return url
@@ -467,7 +467,7 @@ class JavascriptTree(JavascriptTreeNode):
print(t.js_str_html())
print(t.js_data())
- """ # noqa
+ """ # noqa: E501
def __init__(
self,
diff --git a/crate_anon/crateweb/research/html_functions.py b/crate_anon/crateweb/research/html_functions.py
index a6a80bce..4199332a 100644
--- a/crate_anon/crateweb/research/html_functions.py
+++ b/crate_anon/crateweb/research/html_functions.py
@@ -93,7 +93,7 @@ def visibility_button(
{title_html}
{eltype}>
- """ # noqa
+ """ # noqa: E501
def visibility_contentdiv(
diff --git a/crate_anon/crateweb/research/migrations/0007_sitewidequery.py b/crate_anon/crateweb/research/migrations/0007_sitewidequery.py
index 04d9f377..b407d921 100644
--- a/crate_anon/crateweb/research/migrations/0007_sitewidequery.py
+++ b/crate_anon/crateweb/research/migrations/0007_sitewidequery.py
@@ -53,20 +53,20 @@ class Migration(migrations.Migration):
"64-bit non-cryptographic hash of SQL query"
)
),
- ), # nopep8
+ ),
(
"args",
JsonClassField(
null=True, verbose_name="SQL arguments (as JSON)"
),
- ), # nopep8
+ ),
(
"raw",
models.BooleanField(
default=False,
verbose_name="SQL is raw, not parameter-substituted",
),
- ), # nopep8
+ ),
(
"qmark",
models.BooleanField(
@@ -76,7 +76,7 @@ class Migration(migrations.Migration):
" placeholders"
),
),
- ), # nopep8
+ ),
("created", models.DateTimeField(auto_now_add=True)),
(
"deleted",
@@ -87,13 +87,13 @@ class Migration(migrations.Migration):
" queries are never properly deleted."
),
),
- ), # nopep8
+ ),
(
"description",
models.TextField(
default="", verbose_name="query description"
),
- ), # nopep8
+ ),
],
options={
"abstract": False,
diff --git a/crate_anon/crateweb/research/migrations/0008_query_display.py b/crate_anon/crateweb/research/migrations/0008_query_display.py
index 2de287e4..64faf09c 100644
--- a/crate_anon/crateweb/research/migrations/0008_query_display.py
+++ b/crate_anon/crateweb/research/migrations/0008_query_display.py
@@ -46,6 +46,6 @@ class Migration(migrations.Migration):
field=models.TextField(
default="[]",
verbose_name="Subset of output columns to be displayed",
- ), # nopep8
+ ),
),
]
diff --git a/crate_anon/crateweb/research/migrations/0009_query_no_null.py b/crate_anon/crateweb/research/migrations/0009_query_no_null.py
index b48487d3..3eab05ec 100644
--- a/crate_anon/crateweb/research/migrations/0009_query_no_null.py
+++ b/crate_anon/crateweb/research/migrations/0009_query_no_null.py
@@ -16,6 +16,6 @@ class Migration(migrations.Migration):
field=models.BooleanField(
default=False,
verbose_name="Omit Null columns for this query when displayed",
- ), # nopep8
+ ),
),
]
diff --git a/crate_anon/crateweb/research/models.py b/crate_anon/crateweb/research/models.py
index fbb76d18..95187f47 100644
--- a/crate_anon/crateweb/research/models.py
+++ b/crate_anon/crateweb/research/models.py
@@ -171,7 +171,7 @@ def hack_django_pyodbc_azure_cursorwrapper() -> None:
SettingsKeys.DISABLE_DJANGO_PYODBC_AZURE_CURSOR_FETCHONE_NEXTSET,
True,
):
- # https://stackoverflow.com/questions/5601590/how-to-define-a-default-value-for-a-custom-django-setting # noqa
+ # https://stackoverflow.com/questions/5601590/how-to-define-a-default-value-for-a-custom-django-setting # noqa: E501
hack_django_pyodbc_azure_cursorwrapper()
@@ -278,7 +278,7 @@ def get_executed_researchdb_cursor(
from crate_anon.crateweb.research.models import *
c = get_executed_researchdb_cursor("SELECT 1")
- """ # noqa
+ """ # noqa: E501
args = args or []
cursor = connections[
RESEARCH_DB_CONNECTION_NAME
@@ -372,7 +372,7 @@ def gen_excel_row_elements(
converting to ``str`` in the encoding of the worksheet's choice.)
"""
# Docstring must be a raw string for Sphinx! See
- # http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html#text-syntax-bold-italic-verbatim-and-special-characters # noqa
+ # http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html#text-syntax-bold-italic-verbatim-and-special-characters # noqa: E501
for element in row:
if isinstance(element, bytes):
# Convert to str using the worksheet's encoding.
@@ -621,7 +621,7 @@ def save(self, *args, update_fields=None, **kwargs) -> None:
Custom save method. Ensures that only one :class:`Query` has ``active
== True`` for a given user. Also sets the hash.
"""
- # https://stackoverflow.com/questions/1455126/unique-booleanfield-value-in-django # noqa
+ # https://stackoverflow.com/questions/1455126/unique-booleanfield-value-in-django # noqa: E501
if self.active:
Query.objects.filter(user=self.user, active=True).update(
active=False
@@ -1252,7 +1252,7 @@ def __str__(self) -> str:
# # https://docs.djangoproject.com/en/1.8/topics/db/multi-db/
# # https://newcircle.com/s/post/1242/django_multiple_database_support
# # noinspection PyMethodMayBeStatic,PyUnusedLocal
-# def db_for_read(self, model: Type[models.Model], **hints) -> Optional[str]: # noqa
+# def db_for_read(self, model: Type[models.Model], **hints) -> Optional[str]: # noqa: E501
# """
# read model PidLookup -> look at database secret
# """
@@ -1269,7 +1269,7 @@ def __str__(self) -> str:
# # 2017-02-12, to address bug:
# # - https://code.djangoproject.com/ticket/27054
# # See also:
-# # - https://docs.djangoproject.com/en/1.10/topics/db/multi-db/#using-other-management-commands # noqa
+# # - https://docs.djangoproject.com/en/1.10/topics/db/multi-db/#using-other-management-commands # noqa: E501
# return db == 'default'
@@ -1311,7 +1311,7 @@ class Meta:
managed = False
db_table = PatientInfoConstants.SECRET_MAP_TABLENAME
- # https://stackoverflow.com/questions/12158463/how-can-i-make-a-model-read-only # noqa
+ # https://stackoverflow.com/questions/12158463/how-can-i-make-a-model-read-only # noqa: E501
def save(self, *args, **kwargs) -> None:
return
@@ -1535,7 +1535,7 @@ class PatientMultiQuery:
- ... within
- """ # noqa
+ """ # noqa: E501
def __init__(
self,
@@ -1690,7 +1690,7 @@ def set_override_query(self, query: str) -> None:
AND anonymous_output.patient.nhshash IS NOT NULL
ORDER BY _mrid
- """ # noqa
+ """ # noqa: E501
self._manual_patient_id_query = query
def _get_select_mrid_column(self) -> Optional[ColumnId]:
diff --git a/crate_anon/crateweb/research/research_db_info.py b/crate_anon/crateweb/research/research_db_info.py
index 1455e952..849ddd57 100644
--- a/crate_anon/crateweb/research/research_db_info.py
+++ b/crate_anon/crateweb/research/research_db_info.py
@@ -554,7 +554,7 @@ def _schema_query_microsoft(
- ``sys.fulltext_catalogs``: https://msdn.microsoft.com/en-us/library/ms188779.aspx
- ``sys.fulltext_index_columns``: https://msdn.microsoft.com/en-us/library/ms188335.aspx
- """ # noqa
+ """ # noqa: E501
if not schema_names:
raise ValueError(
"No schema_names specified (for SQL Server " "database)"
diff --git a/crate_anon/crateweb/research/views.py b/crate_anon/crateweb/research/views.py
index 937aed34..0035d401 100644
--- a/crate_anon/crateweb/research/views.py
+++ b/crate_anon/crateweb/research/views.py
@@ -156,6 +156,16 @@
get_patients_per_page,
UserProfile,
)
+from crate_anon.nlp_manager.constants import (
+ # Fieldnames for CRATE NLP table
+ FN_NLPDEF,
+ FN_SRCDB,
+ FN_SRCFIELD,
+ FN_SRCPKFIELD,
+ FN_SRCPKSTR,
+ FN_SRCPKVAL,
+ FN_SRCTABLE,
+)
log = BraceStyleAdapter(logging.getLogger(__name__))
@@ -173,16 +183,6 @@
MPID_PREFIX = "~mpid"
-# Fieldnames for CRATE NLP table
-FN_NLPDEF = "_nlpdef"
-FN_SRCDB = "_srcdb"
-FN_SRCTABLE = "_srctable"
-FN_SRCFIELD = "_srcfield"
-FN_SRCPKFIELD = "_srcpkfield"
-FN_SRCPKVAL = "_srcpkval"
-FN_SRCPKSTR = "_srcpkstr"
-
-
# =============================================================================
# Helper functions
# =============================================================================
@@ -1596,7 +1596,7 @@ def resultset_html_table(
Returns:
str: HTML
- """ # noqa
+ """ # noqa: E501
# Considered but not implemented: hiding table columns
# ... see esp "tr > *:nth-child(n)" at
# https://stackoverflow.com/questions/5440657/how-to-hide-columns-in-html-table # noqa
@@ -1729,7 +1729,7 @@ def single_record_html_table(
Returns:
str: HTML
- """ # noqa
+ """ # noqa: E501
table_html = ""
if FN_NLPDEF in fieldnames:
srcdb_ind = srctable_ind = srcfield_ind = None
@@ -2410,7 +2410,7 @@ def render_lookup(
Returns:
a :class:`django.http.response.HttpResponse`
- """ # noqa
+ """ # noqa: E501
# if not request.user.superuser:
# return HttpResponse('Forbidden', status=403)
# # https://stackoverflow.com/questions/3297048/403-forbidden-vs-401-unauthorized-http-responses # noqa
@@ -2896,7 +2896,7 @@ def common_find_text(
Returns:
a :class:`django.http.response.HttpResponse`
- """ # noqa
+ """ # noqa: E501
# When you forget about Django forms, go back to:
# http://www.slideshare.net/pydanny/advanced-django-forms-usage
diff --git a/crate_anon/linkage/comparison.py b/crate_anon/linkage/comparison.py
index 26f44d09..c321ba40 100644
--- a/crate_anon/linkage/comparison.py
+++ b/crate_anon/linkage/comparison.py
@@ -272,7 +272,7 @@ def __init__(
:math:`P(D | \neg H) = P(\text{match given different person}) = p_f`.
If no match:
:math:`P(D | \neg H) = 1 - P(\text{match given different person}) = 1 - p_f`.
- """ # noqa
+ """ # noqa: E501
super().__init__()
self.match = match
self.p_match_given_same_person = p_match_given_same_person
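
Worked arithmetic for the docstring above (the p_m and p_f figures are
invented for illustration):

    import math

    p_m = 0.95  # P(D | H): P(match | same person)
    p_f = 0.01  # P(D | ¬H): P(match | different person)

    # Log likelihood ratio contributed by one comparison:
    llr_if_match = math.log(p_m / p_f)                 # a match observed
    llr_if_no_match = math.log((1 - p_m) / (1 - p_f))  # no match observed
    print(llr_if_match, llr_if_no_match)  # positive vs negative evidence
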
diff --git a/crate_anon/linkage/constants.py b/crate_anon/linkage/constants.py
index 2f084388..ff44d358 100644
--- a/crate_anon/linkage/constants.py
+++ b/crate_anon/linkage/constants.py
@@ -70,7 +70,7 @@ def _mk_dictstr(x: Dict[str, float]) -> str:
THIS_DIR = os.path.abspath(os.path.dirname(__file__))
UK_MEAN_OA_POPULATION_2011 = 309 # not used any more! Left here for interest.
-# ... https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography # noqa
+# ... https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography # noqa: E501
UK_POPULATION_2017 = 66040000 # 2017 figure, 66.04m
CAMBS_PBORO_POPULATION_2018 = 852523
diff --git a/crate_anon/linkage/helpers.py b/crate_anon/linkage/helpers.py
index 42903e05..2e15e57e 100644
--- a/crate_anon/linkage/helpers.py
+++ b/crate_anon/linkage/helpers.py
@@ -143,7 +143,7 @@
- this allows you to look up *from* output area *to* postcode sector, implying
that postcode sectors must be larger.
-""" # noqa
+""" # noqa: E501
# =============================================================================
@@ -179,7 +179,7 @@
>>> dmeta("Clérambault") # raises UnicodeEncodeError
-""" # noqa
+""" # noqa: E501
# =============================================================================
@@ -501,7 +501,7 @@ def get_postcode_sector(
# (*) [2] uses "Z99 3CZ" (page 6); [1, 3] use "ZZ99 3CZ".
))
PSEUDO_POSTCODE_SECTORS = set(get_postcode_sector(p) for p in PSEUDO_POSTCODES)
-""" # noqa
+""" # noqa: E501
PSEUDO_POSTCODE_START = "ZZ99"
PSEUDOPOSTCODE_NFA = "ZZ993VZ" # no fixed abode
@@ -806,7 +806,7 @@ def mk_blurry_dates(d: Union[Date, str]) -> Tuple[str, str, str]:
Returns MONTH_DAY, YEAR_DAY, and YEAR_MONTH versions in a standard form.
"""
# ISO format is %Y-%m-%d; see
- # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes # noqa
+ # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes # noqa: E501
# Here we want the shortest full representation; these are not intended to
# be human-legible.
d = coerce_to_pendulum_date(d)
diff --git a/crate_anon/linkage/matchconfig.py b/crate_anon/linkage/matchconfig.py
index a68f761a..62d6c2d9 100644
--- a/crate_anon/linkage/matchconfig.py
+++ b/crate_anon/linkage/matchconfig.py
@@ -678,7 +678,7 @@ def mk_p_c_dict(
simplify(second_stage_speedup)
# = b + 647 / 16
- """ # noqa
+ """ # noqa: E501
if verbose:
log.debug(f"... MatchConfig built. Settings: {self}")
diff --git a/crate_anon/linkage/people.py b/crate_anon/linkage/people.py
index 3ca8f4cb..2cd9a509 100644
--- a/crate_anon/linkage/people.py
+++ b/crate_anon/linkage/people.py
@@ -287,7 +287,7 @@ def get_unique_match_detailed(self, proband: Person) -> MatchResult:
# 2020-04-25: Do this in one pass.
# A bit like
- # https://www.geeksforgeeks.org/python-program-to-find-second-largest-number-in-a-list/ # noqa
+ # https://www.geeksforgeeks.org/python-program-to-find-second-largest-number-in-a-list/ # noqa: E501
# ... but modified, as that fails to deal with joint winners
# ... and it's not a super algorithm anyway.
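
A self-contained sketch (a local stand-in, not the CRATE implementation) of
the one-pass best/second-best scan that, unlike the linked example, copes
with joint winners:

    from typing import Optional, Sequence, Tuple

    def best_two(
        scores: Sequence[float],
    ) -> Tuple[Optional[float], Optional[float]]:
        best = second = None  # type: Optional[float]
        for s in scores:
            if best is None or s > best:
                best, second = s, best  # new winner; old winner demoted
            elif second is None or s > second:
                second = s  # joint winners land here (s == best)
        return best, second

    assert best_two([3.0, 7.0, 7.0, 1.0]) == (7.0, 7.0)  # tie detected
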
diff --git a/crate_anon/linkage/validation/entropy_names.py b/crate_anon/linkage/validation/entropy_names.py
index 11a979f8..3e21046a 100755
--- a/crate_anon/linkage/validation/entropy_names.py
+++ b/crate_anon/linkage/validation/entropy_names.py
@@ -126,7 +126,7 @@
.. code-block:: none
inf(A | B) = -log2[P(A | B)] [8], from [5]
-
+
= -log2{ P(A) * pev(A, B) } from [4a]
= -{ log2[P(A)] + log2[pev(A, B)] }
= -log2[P(A)] - log2[pev(A, B)]
@@ -219,15 +219,15 @@
the other. The "iev" concept above is about pairs of individual events.
For two discrete RVs,
-
- I(X; Y) = sum_y{ sum_x{ P_XY(x, y) log[ P_XY(x, y) / (P_X(x) * P_Y(y)) ] }}
+
+ I(X; Y) = sum_y{ sum_x{ P_XY(x, y) log[ P_XY(x, y) / (P_X(x) * P_Y(y)) ] }}
- Mutual information is a consideration across events. The individual-event
- version is "pointwise mutual information",
+ version is "pointwise mutual information",
https://en.wikipedia.org/wiki/Pointwise_mutual_information, which is
-
+
.. code-block:: none
-
+
pmi(x; y) = log[ P(x, y) / (P(x) * P(y) ]
= log[ P(x | y) / P(x) ]
= log[ P(y | x) / P(y) ]
@@ -251,7 +251,7 @@
.. code-block:: none
- ln(x) = log2(x) * ln(2)
+ ln(x) = log2(x) * ln(2)
log2(x) = ln(x) * log2(e)
A partial match would provide a log likelihood of
@@ -279,7 +279,7 @@
Code largely abandoned; not re-checked since NameFrequencyInfo was refactored,
since this code had served it purpose.
-""" # noqa
+""" # noqa: E501
# =============================================================================
@@ -986,7 +986,7 @@ def show_partial_match_frequencies() -> None:
P(share metaphone, not first two char or name) = 0.0014685292425002856
P(share first two char, not metaphone or name) = 0.014894287604562073
-""" # noqa
+""" # noqa: E501
# =============================================================================
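
A numeric check of the pointwise mutual information identities quoted in the
docstring above (the probabilities are invented for illustration):

    import math

    p_x, p_y, p_xy = 0.2, 0.4, 0.1  # marginals and joint

    pmi = math.log2(p_xy / (p_x * p_y))  # log[P(x, y) / (P(x) * P(y))]
    assert math.isclose(pmi, math.log2((p_xy / p_y) / p_x))  # log[P(x|y)/P(x)]
    assert math.isclose(pmi, math.log2((p_xy / p_x) / p_y))  # log[P(y|x)/P(y)]
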
diff --git a/crate_anon/linkage/validation/validate_fuzzy_linkage.py b/crate_anon/linkage/validation/validate_fuzzy_linkage.py
index adc4937e..45c36e6e 100755
--- a/crate_anon/linkage/validation/validate_fuzzy_linkage.py
+++ b/crate_anon/linkage/validation/validate_fuzzy_linkage.py
@@ -1090,7 +1090,7 @@ def validate_2_fetch_cdl(
-- Final count: 152888 (on 2022-05-26).
-- Compare: SELECT COUNT(*) FROM rawCRSCDL.dbo.[CRS_Output_2020 09 21] = 162874
- """ # noqa
+ """ # noqa: E501
)
engine = create_engine(url, echo=echo)
result = engine.execute(sql) # type: CursorResult
@@ -1344,7 +1344,7 @@ def validate_2_fetch_pcmis(
-- Final count: 93347 (on 2022-05-26).
-- Compare: SELECT COUNT(*) FROM rawPCMIS.dbo.PatientDetails = 94344.
- """ # noqa
+ """ # noqa: E501
)
engine = create_engine(url, echo=echo)
result = engine.execute(sql) # type: CursorResult
@@ -1775,7 +1775,7 @@ def validate_2_fetch_rio(
-- 2022-06-16) after removal of NNNStatus clause.
-- Compare: SELECT COUNT(*) FROM RiO62CAMLive.dbo.ClientIndex = 216739
-- Compare: SELECT COUNT(*) FROM RiO62CAMLive.dbo.Client = 216739
- """ # noqa
+ """ # noqa: E501
)
engine = create_engine(url, echo=echo)
result = engine.execute(sql) # type: CursorResult
@@ -2072,7 +2072,7 @@ def validate_2_fetch_systmone(
-- Final count: 613175 (2022-06-01).
-- Compare: SELECT COUNT(*) FROM SystmOne.dbo.S1_Patient = 619062.
- """ # noqa
+ """ # noqa: E501
)
engine = create_engine(url, echo=echo)
result = engine.execute(sql) # type: CursorResult
@@ -2385,7 +2385,7 @@ def help_v2_compare(plaintext: bool) -> str:
set {EnvVar.COMMON_OPTIONS}=--{Switches.POPULATION_SIZE} {CAMBS_POPULATION} --{Switches.EXTRA_VALIDATION_OUTPUT}
cd "%{EnvVar.DATADIR}%"
{help_v2_compare(plaintext=False)}
-""" # noqa
+""" # noqa: E501
# Skipped: {help_v2_compare(plaintext=True)}
diff --git a/crate_anon/nlp_manager/all_processors.py b/crate_anon/nlp_manager/all_processors.py
index f202e3f9..0c9f9863 100644
--- a/crate_anon/nlp_manager/all_processors.py
+++ b/crate_anon/nlp_manager/all_processors.py
@@ -67,6 +67,7 @@
from crate_anon.nlp_manager.parse_substance_misuse import (
ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS,
)
+from crate_anon.nlp_webserver.server_processor import ServerProcessor
ClassType = Type[object]
@@ -119,12 +120,16 @@ def all_local_parser_classes() -> List[Type[BaseNlpParser]]:
Return all classes that are non-abstract subclasses of
:class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser`.
+ ... but not test parsers.
+
Checks that they all have unique names in lower case.
"""
# noinspection PyTypeChecker
classes = get_all_subclasses(
BaseNlpParser
) # type: List[Type[BaseNlpParser]]
+ classes = [cls for cls in classes if not cls.is_test_nlp_parser]
+
lower_case_short_names = set() # type: Set[str]
lower_case_full_names = set() # type: Set[str]
for cls in classes:
@@ -209,40 +214,36 @@ def make_nlp_parser(
raise ValueError(f"Unknown NLP processor type: {classname!r}")
-def make_nlp_parser_unconfigured(
- classname: str, raise_if_absent: bool = True
-) -> Optional[TableMaker]:
+def possible_local_processor_names() -> List[str]:
"""
- Get a debugging (unconfigured) instance of an NLP parser.
-
- Args:
- classname: the name of the NLP parser class
- raise_if_absent: raise ``ValueError`` if there is no match?
-
- Returns:
- the class, or ``None`` if there isn't one with that name
-
+ Returns all NLP processor names that can run locally.
"""
- cls = get_nlp_parser_class(classname)
- if cls:
- return cls(nlpdef=None, cfg_processor_name=None)
- if raise_if_absent:
- raise ValueError(f"Unknown NLP processor type: {classname!r}")
- return None
+ return [cls.classname() for cls in all_local_parser_classes()]
-def possible_local_processor_names() -> List[str]:
+def all_nlp_processor_classes() -> List[Type[TableMaker]]:
"""
- Returns all NLP processor names that can run locally.
+ Returns all NLP processor classes.
"""
- return [cls.classname() for cls in all_local_parser_classes()]
+ return all_tablemaker_classes()
def possible_processor_names_including_cloud() -> List[str]:
"""
Returns all NLP processor names.
"""
- return [cls.classname() for cls in all_tablemaker_classes()]
+ return [cls.classname() for cls in all_nlp_processor_classes()]
+
+
+def all_local_processor_classes_without_external_tools() -> (
+ List[Type[BaseNlpParser]]
+):
+ """
+ Returns all NLP processor classes that don't rely on external tools.
+ """
+ return [
+ cls for cls in all_local_parser_classes() if not cls.uses_external_tool
+ ]
def possible_local_processor_names_without_external_tools() -> List[str]:
@@ -252,8 +253,7 @@ def possible_local_processor_names_without_external_tools() -> List[str]:
"""
return [
cls.classname()
- for cls in all_local_parser_classes()
- if not cls.uses_external_tool
+ for cls in all_local_processor_classes_without_external_tools()
]
@@ -287,10 +287,33 @@ def all_crate_python_processors_nlprp_processor_info(
list: list of processor information dictionaries
"""
allprocs = [] # type: JsonArrayType
- for cls in all_local_parser_classes():
+ for cls in all_local_processor_classes_without_external_tools():
instance = cls(None, None)
proc_info = instance.nlprp_processor_info(sql_dialect=sql_dialect)
if extra_dict:
proc_info.update(extra_dict)
allprocs.append(proc_info)
return allprocs
+
+
+def register_all_crate_python_processors_with_serverprocessor(
+ set_parser: bool = True,
+) -> None:
+ """
+ Somewhat ugly. Register all CRATE Python NLP processors with the
+ ServerProcessor class.
+
+ See also crate_anon/nlp_webserver/procs.py, for a similar thing from JSON.
+
+ Args:
+ set_parser:
+ Set up a "free-floating" parser too?
+ """
+ for cls in all_local_processor_classes_without_external_tools():
+ instance = cls(None, None)
+ _proc = instance.nlprp_processor_info()
+ _x = ServerProcessor.from_nlprp_json_dict(_proc)
+ # ... registers with the ServerProcessor class
+ # Doing this here saves time per request
+ if set_parser:
+ _x.set_parser()
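
A hedged usage sketch of the helpers added above (the registration call
assumes the nlp_webserver machinery is importable):

    from crate_anon.nlp_manager.all_processors import (
        all_local_processor_classes_without_external_tools,
        register_all_crate_python_processors_with_serverprocessor,
    )

    # Enumerate the pure-Python processors (no GATE/MedEx etc. needed):
    for cls in all_local_processor_classes_without_external_tools():
        print(cls.classname())

    # Register them all with ServerProcessor before serving NLPRP requests,
    # setting up "free-floating" parsers so each request saves that work:
    register_all_crate_python_processors_with_serverprocessor(set_parser=True)
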
diff --git a/crate_anon/nlp_manager/base_nlp_parser.py b/crate_anon/nlp_manager/base_nlp_parser.py
index 6c07208e..a281558a 100644
--- a/crate_anon/nlp_manager/base_nlp_parser.py
+++ b/crate_anon/nlp_manager/base_nlp_parser.py
@@ -67,6 +67,7 @@
COMMENT,
TABLE_KWARGS,
)
+from crate_anon.common.sql import decorate_index_name
from crate_anon.common.stringfunc import (
compress_docstring,
does_text_contain_word_chars,
@@ -79,7 +80,7 @@
full_sectionname,
NlpConfigPrefixes,
ProcessorConfigKeys,
- GateFieldNames as GateFN,
+ GateFieldNames,
SqlTypeDbIdentifier,
MAX_SQL_FIELD_LEN,
)
@@ -198,7 +199,7 @@ def fully_qualified_classname(cls) -> str:
Returns the class's fully qualified name.
"""
# This may be imperfect; see
- # https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python # noqa
+ # https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python # noqa: E501
# https://www.python.org/dev/peps/pep-3155/
return ".".join([cls.__module__, cls.__qualname__])
@@ -227,6 +228,9 @@ def dest_tables_indexes(self) -> Dict[str, List[Index]]:
Describes indexes that this NLP processor suggests for its destination
table(s).
+    It is perfectly legitimate for this dictionary not to include some
+    tables, or indeed to be empty.
+
Returns:
dict: a dictionary of ``{tablename: indexes}``, where ``indexes``
is a list of SQLAlchemy :class:`Index` objects.
@@ -400,39 +404,50 @@ def _standard_gate_columns() -> List[Column]:
"""
return [
Column(
- GateFN.SET, SqlTypeDbIdentifier, comment="GATE output set name"
+ GateFieldNames.SET,
+ SqlTypeDbIdentifier,
+ comment="GATE output set name",
),
Column(
- GateFN.TYPE,
+ GateFieldNames.TYPE,
SqlTypeDbIdentifier,
comment="GATE annotation type name",
),
Column(
- GateFN.ID,
+ GateFieldNames.ID,
Integer,
comment="GATE annotation ID (not clear this is very useful)",
),
Column(
- GateFN.STARTPOS,
+ GateFieldNames.STARTPOS,
Integer,
comment="Start position in the content",
),
Column(
- GateFN.ENDPOS, Integer, comment="End position in the content"
+ GateFieldNames.ENDPOS,
+ Integer,
+ comment="End position in the content",
),
Column(
- GateFN.CONTENT,
+ GateFieldNames.CONTENT,
Text,
comment="Full content marked as relevant.",
),
]
- @staticmethod
- def _standard_gate_indexes() -> List[Index]:
+ def _standard_gate_indexes(self, dest_tablename: str) -> List[Index]:
"""
Returns standard indexes for GATE output.
"""
- return [Index("_idx__set", GateFN.SET, mysql_length=MAX_SQL_FIELD_LEN)]
+ return [
+ Index(
+ decorate_index_name(
+ "_idx__set", dest_tablename, self.dest_engine
+ ),
+ GateFieldNames.SET,
+ mysql_length=MAX_SQL_FIELD_LEN,
+ )
+ ]
@lru_cache(maxsize=None)
def tables(self) -> Dict[str, Table]:
@@ -476,7 +491,9 @@ def tables(self) -> Dict[str, Table]:
copyindexes_list = [i.get_copy_indexes() for i in ifconfigs]
self._assert_index_lists_identical(copyindexes_list)
copy_indexes = copyindexes_list[0]
- core_indexes = InputFieldConfig.get_core_indexes_for_dest()
+ core_indexes = InputFieldConfig.get_core_indexes_for_dest(
+ tablename=tablename, engine=self._destdb.engine
+ )
column_like_things = (
copy_of_cols + core_indexes + extra_dest_indexes + copy_indexes
@@ -514,9 +531,12 @@ def get_table(self, tablename: str) -> Table:
try:
return tables[tablename]
except KeyError:
+ all_tablenames = list(tables.keys())
raise KeyError(
- f"No destination table for this NLP processor "
- f"named {tablename!r}"
+ f"For this NLP processor ({self._cfg_processor_name!r}), the "
+ f"destination table named {tablename!r} does not have an "
+ f"associated Table object. Known Table objects are "
+ f"named {all_tablenames}"
)
def make_tables(self, drop_first: bool = False) -> List[str]:
@@ -568,7 +588,7 @@ def delete_dest_record(
If you don't do this, we will get deadlocks in incremental mode.
See e.g.
https://dev.mysql.com/doc/refman/5.5/en/innodb-deadlocks.html
- """ # noqa
+ """ # noqa: E501
session = self.dest_session
srcdb = ifconfig.srcdb
srctable = ifconfig.srctable
@@ -685,6 +705,7 @@ class BaseNlpParser(TableMaker):
"""
uses_external_tool = False # may be overridden
+ is_test_nlp_parser = False # may be overridden by tests!
def __init__(
self,
@@ -855,7 +876,7 @@ def describe_sqla_col(
# dialect = MSDialect()
column_type = column.type.compile(dialect)
data_type = column_type.partition("(")[0]
- # ... https://stackoverflow.com/questions/27387415/how-would-i-get-everything-before-a-in-a-string-python # noqa
+ # ... https://stackoverflow.com/questions/27387415/how-would-i-get-everything-before-a-in-a-string-python # noqa: E501
return {
NlprpKeys.COLUMN_NAME: column.name,
NlprpKeys.COLUMN_TYPE: column_type,
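
The partition("(")[0] idiom above strips any length parameter from the
compiled column type; for example:

    assert "VARCHAR(50)".partition("(")[0] == "VARCHAR"
    assert "INTEGER".partition("(")[0] == "INTEGER"  # no "(": string intact
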
diff --git a/crate_anon/nlp_manager/build_medex_itself.py b/crate_anon/nlp_manager/build_medex_itself.py
index ab4c226c..3460e1e1 100755
--- a/crate_anon/nlp_manager/build_medex_itself.py
+++ b/crate_anon/nlp_manager/build_medex_itself.py
@@ -591,7 +591,7 @@ def main() -> None:
(certainly the &/&& one!) or else they wouldn't print a stack trace and chug
on.
- """ # noqa
+ """ # noqa: E501
for bf in bugfixes:
filename = bf["filename"]
diff --git a/crate_anon/nlp_manager/cloud_parser.py b/crate_anon/nlp_manager/cloud_parser.py
index 6473e49c..ca677b96 100644
--- a/crate_anon/nlp_manager/cloud_parser.py
+++ b/crate_anon/nlp_manager/cloud_parser.py
@@ -25,12 +25,10 @@
Send text to a cloud-based NLPRP server for processing.
-.. todo:: cloud_parser: handle new ``tabular_schema`` info from server
-
"""
import logging
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
from cardinal_pythonlib.lists import chunks
from sqlalchemy.schema import Column, Index
@@ -39,7 +37,7 @@
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.constants import ProcessorConfigKeys, NlpDefValues
from crate_anon.nlp_manager.output_user_config import OutputUserConfig
-from crate_anon.nlprp.constants import NlprpKeys as NKeys, NlprpValues
+from crate_anon.nlprp.constants import NlprpKeys, NlprpValues
from crate_anon.nlp_manager.base_nlp_parser import TableMaker
from crate_anon.nlp_webserver.server_processor import ServerProcessor
@@ -86,10 +84,9 @@ def __init__(
self.sql_dialect = None
self.schema = None # type: Optional[Dict[str, Any]]
self.available_remotely = False # update later if available
- # Output section - bit of repetition from the 'Gate' parser
+ # Output section
self._outputtypemap = {} # type: Dict[str, OutputUserConfig]
self._type_to_tablename = {} # type: Dict[str, str]
- self.tablename = None
if not nlpdef and not cfg_processor_name:
# Debugging only
@@ -107,24 +104,30 @@ def __init__(
self.format = self._cfgsection.opt_str(
ProcessorConfigKeys.PROCESSOR_FORMAT, required=True
)
-
# Output section - bit of repetition from the 'Gate' parser
typepairs = self._cfgsection.opt_strlist(
ProcessorConfigKeys.OUTPUTTYPEMAP, required=True, lower=False
)
- # If typepairs is empty the following block won't execute
for output_type, outputsection in chunks(typepairs, 2):
output_type = output_type.lower()
c = OutputUserConfig(
- nlpdef.parser, outputsection, schema_required=False
+ config_parser=nlpdef.parser,
+ cfg_output_name=outputsection,
+ schema_required=False,
)
self._outputtypemap[output_type] = c
self._type_to_tablename[output_type] = c.dest_tablename
- if output_type == '""':
- self.tablename = c.dest_tablename
+ # Also, ensure the user doesn't specify desttable (would be
+ # confusing).
+ if self._cfgsection.opt_str(ProcessorConfigKeys.DESTTABLE):
+ raise ValueError(
+ f"For cloud processors, don't specify "
+ f"{ProcessorConfigKeys.DESTTABLE!r}; table information is "
+ f"in {ProcessorConfigKeys.OUTPUTTYPEMAP!r}"
+ )
@staticmethod
- def get_coltype_parts(coltype_str: str) -> List[str]:
+ def get_coltype_parts(coltype_str: str) -> Tuple[str, Union[str, int]]:
"""
Get root column type and parameter, i.e. for VARCHAR(50)
root column type is VARCHAR and parameter is 50.
@@ -136,7 +139,7 @@ def get_coltype_parts(coltype_str: str) -> List[str]:
else:
try:
col_str, parameter = parts
- except ValueError:
+ except ValueError: # e.g. "too many values to unpack"
log.error(f"Invalid column type in response: {coltype_str}")
raise
try:
@@ -144,17 +147,24 @@ def get_coltype_parts(coltype_str: str) -> List[str]:
parameter = int(parameter)
except ValueError:
pass
- return [col_str, parameter]
+ return col_str, parameter
@staticmethod
- def str_to_coltype_general(coltype_str: str) -> Type[sqlatypes.TypeEngine]:
+ def data_type_str_to_coltype(
+ data_type_str: str,
+ ) -> Type[sqlatypes.TypeEngine]:
"""
- Get the sqlalchemy column type class which fits with the column type.
+ Get the SQLAlchemy column type class which fits with the data type
+ specified. Currently we IGNORE self.sql_dialect.
"""
-        coltype = getattr(sqlatypes, coltype_str)
-        # Check if 'coltype' is really an sqlalchemy column type
-        if issubclass(coltype, sqlatypes.TypeEngine):
-            return coltype
+        coltype = getattr(sqlatypes, data_type_str, None)
+        # Check that 'coltype' is really an SQLAlchemy column type. The
+        # getattr() default matters: an unknown name should fall through to
+        # the NotImplementedError below, not raise AttributeError here.
+        if isinstance(coltype, type) and issubclass(
+            coltype, sqlatypes.TypeEngine
+        ):
+            return coltype
+ raise NotImplementedError(
+ f"Don't know the SQLAlchemy column type corresponding to "
+ f"data type: {data_type_str!r}"
+ )
def is_tabular(self) -> bool:
"""
@@ -163,27 +173,78 @@ def is_tabular(self) -> bool:
"""
return self.schema_type == NlprpValues.TABULAR
+ def get_tabular_schema_tablenames(self) -> List[str]:
+ """
+ Returns the names of the tables in the tabular schema (or an empty list
+ if we do not have a tabular schema).
+ """
+ if not self.is_tabular():
+ return []
+ return list(self.schema.keys())
+
+ def get_local_from_remote_tablename(self, remote_tablename: str) -> str:
+ """
+ When the remote server specifies a table name, we need to map it to
+ a local database table name.
+
+ Raises KeyError on failure.
+ """
+ try:
+ return self.get_tablename_from_type(remote_tablename)
+ except KeyError:
+ raise KeyError(
+ "No local table name defined for remote table "
+ f"{remote_tablename!r}"
+ )
+
+ def get_first_local_tablename(self) -> str:
+ """
+ Used in some circumstances when the remote processor doesn't specify
+ a table.
+ """
+ assert len(self._type_to_tablename) > 0
+        # NB _type_to_tablename is a dict, so take the first value in
+        # insertion order; positional indexing with [0] would be a KeyError.
+        return next(iter(self._type_to_tablename.values()))
+
def get_tablename_from_type(self, output_type: str) -> str:
- return self._type_to_tablename[output_type]
+ """
+ For simple remote GATE processors, or cloud processors: for a given
+ annotation type (GATE) or remote table name (cloud), return the
+ destination table name.
+
+ Enforces lower-case lookup.
+
+ Will raise KeyError if this fails.
+ """
+ return self._type_to_tablename[output_type.lower()]
def get_otconf_from_type(self, output_type: str) -> OutputUserConfig:
- return self._outputtypemap[output_type]
+ """
+ For a GATE annotation type, or cloud remote table name, return the
+ corresponding OutputUserConfig.
+
+ Enforces lower-case lookup.
+
+ Will raise KeyError if this fails.
+ """
+ return self._outputtypemap[output_type.lower()]
def _standard_columns_if_gate(self) -> List[Column]:
"""
Returns standard columns for GATE output if ``self.format`` is GATE.
+ Returns an empty list otherwise.
"""
if self.format == NlpDefValues.FORMAT_GATE:
return self._standard_gate_columns()
else:
return []
- def _standard_indexes_if_gate(self) -> List[Index]:
+ def _standard_indexes_if_gate(self, dest_tablename: str) -> List[Index]:
"""
Returns standard indexes for GATE output if ``self.format`` is GATE.
+ Returns an empty list otherwise.
"""
if self.format == NlpDefValues.FORMAT_GATE:
- return self._standard_gate_indexes()
+ return self._standard_gate_indexes(dest_tablename)
else:
return []
@@ -198,17 +259,15 @@ def set_procinfo_if_correct(
self, remote_processor: ServerProcessor
) -> None:
"""
-        Checks if a processor dictionary, with all the nlprp specified info
-        a processor should have, belongs to this processor. If it does, then
-        we add the information from the procesor dictionary.
+        Checks if a processor dictionary, with all the NLPRP-specified info
+        a processor should have, belongs to this processor. If it does, then
+        we add the information from the processor dictionary.
"""
if self.procname != remote_processor.name:
return
- # if ((self.procversion is None and
- # processor_dict[NKeys.IS_DEFAULT_VERSION]) or
- if (
- self.procversion is None and remote_processor.is_default_version
- ) or (self.procversion == remote_processor.version):
+ if (remote_processor.is_default_version and not self.procversion) or (
+ self.procversion == remote_processor.version
+ ):
self._set_processor_info(remote_processor)
def _set_processor_info(self, remote_processor: ServerProcessor) -> None:
@@ -222,122 +281,122 @@ def _set_processor_info(self, remote_processor: ServerProcessor) -> None:
self.remote_processor_info = remote_processor
# self.name = processor_dict[NKeys.NAME]
self.schema_type = remote_processor.schema_type
- if self.is_tabular():
+ if remote_processor.is_tabular():
self.schema = remote_processor.tabular_schema
self.sql_dialect = remote_processor.sql_dialect
# Check that, by this stage, we either have a tabular schema from
# the processor, or we have user-specified destfields
- assert self.is_tabular or all(
+ assert self.is_tabular() or all(
x.destfields for x in self._outputtypemap.values()
), (
"You haven't specified a table structure and the processor hasn't "
"provided one."
)
- def _str_to_coltype(self, data_type_str: str) -> sqlatypes.TypeEngine:
- """
- This is supposed to get column types depending on the sql dialect
- used by the server, but it's not implemented yet.
+ def dest_tables_columns(self) -> Dict[str, List[Column]]:
"""
- raise NotImplementedError
- # if self.sql_dialect == SqlDialects.MSSQL:
- # return self._str_to_coltype_mssql(data_type_str)
- # elif self.sql_dialect == SqlDialects.MYSQL:
- # return self._str_to_coltype_mysql(data_type_str)
- # elif self.sql_dialect == SqlDialects.ORACLE:
- # return self._str_to_coltype_oracle(data_type_str)
- # elif self.sql_dialect == SqlDialects.POSTGRES:
- # return self._str_to_coltype_postgres(data_type_str)
- # elif self.sql_dialect == SqlDialects.SQLITE:
- # return self._str_to_coltype_sqlite(data_type_str)
- # else:
- # pass
+ Describes the destination table(s) that this NLP processor wants to
+ write to.
- def _dest_tables_columns_user(self) -> Dict[str, List[Column]]:
- tables = {} # type: Dict[str, List[Column]]
+ Returns:
+ dict: a dictionary of ``{tablename: destination_columns}``, where
+ ``destination_columns`` is a list of SQLAlchemy :class:`Column`
+ objects.
- for output_type, otconfig in self._outputtypemap.items():
- tables[
- otconfig.dest_tablename
- ] = self._standard_columns_if_gate() + otconfig.get_columns(
- self.dest_engine
- )
- return tables
+ If there is an NLPRP remote table specification (tabular_schema
+ method), we start with that.
- def _dest_tables_indexes_user(self) -> Dict[str, List[Index]]:
- tables = {} # type: Dict[str, List[Index]]
- for output_type, otconfig in self._outputtypemap.items():
- tables[otconfig.dest_tablename] = (
- self._standard_indexes_if_gate() + otconfig.indexes
- )
- return tables
+ Then we add any user-defined tables. If there is both a remote
+ definition and a local definition, the local definition overrides the
+ remote definition. If the destination table info has no columns,
+ however, it is not used for table creation.
- def _dest_tables_columns_auto(self) -> Dict[str, List[Column]]:
- """
- Gets the destination tables and their columns using the remote
- processor information.
+        In principle there may also be tables in the local config that are
+        absent from the remote info (unusual!).
"""
- tables = {}
- for table, columns in self.schema.items():
- column_objects = (
- self._standard_columns_if_gate()
- ) # type: List[Column]
- if self.tablename:
- tablename = self.tablename
- else:
- tablename = self.get_tablename_from_type(table)
- # ... might be empty list
- for column in columns:
- col_str, parameter = self.get_coltype_parts(
- column[NKeys.COLUMN_TYPE]
+ table_columns = {} # type: Dict[str, List[Column]]
+
+ # 1. NLPRP remote specification.
+ if self.is_tabular():
+ for remote_tablename, columndefs in self.schema.items():
+ # We may start with predefined GATE columns (but this might
+ # return an empty list). We'll then add to it, if additional
+ # information is provided.
+ column_objects = [] # type: List[Column]
+ dest_tname = self.get_local_from_remote_tablename(
+ remote_tablename
)
- data_type_str = column[NKeys.DATA_TYPE]
- coltype = self.str_to_coltype_general(data_type_str)
- column_objects.append(
- Column(
- column[NKeys.COLUMN_NAME],
- coltype if not parameter else coltype(parameter),
- comment=column[NKeys.COLUMN_COMMENT],
- nullable=column[NKeys.IS_NULLABLE],
+ column_renames = self.get_otconf_from_type(
+ remote_tablename
+ ).renames
+ for column_info in columndefs:
+ colname = column_info[NlprpKeys.COLUMN_NAME]
+ # Rename (or keep the same if no applicable rename):
+ colname = column_renames.get(colname, colname)
+ col_str, parameter = self.get_coltype_parts(
+ column_info[NlprpKeys.COLUMN_TYPE]
+ )
+ data_type_str = column_info[NlprpKeys.DATA_TYPE]
+ # We could use col_str or data_type_str here.
+ coltype = self.data_type_str_to_coltype(data_type_str)
+ column_objects.append(
+ Column(
+ name=colname,
+ type_=coltype(parameter) if parameter else coltype,
+ comment=column_info.get(NlprpKeys.COLUMN_COMMENT),
+ nullable=column_info[NlprpKeys.IS_NULLABLE],
+ )
+ )
+ if not column_objects:
+ raise ValueError(
+ "Remote error: NLPRP server declares table "
+ f"{remote_tablename!r} but provides no column "
+ "information for it."
)
+ table_columns[dest_tname] = column_objects
+
+ # 2. User specification.
+ for output_type, otconfig in self._outputtypemap.items():
+ if otconfig.destfields:
+ # The user has specified columns.
+ table_columns[
+ otconfig.dest_tablename
+ ] = self._standard_columns_if_gate() + otconfig.get_columns(
+ self.dest_engine
)
- tables[tablename] = column_objects
- return tables
+ else:
+ # The user has noted the existence of the table, but hasn't
+ # specified columns.
+ if otconfig.dest_tablename not in table_columns:
+ raise ValueError(
+ f"Local table {otconfig.dest_tablename!r} has no "
+ "remote definition, and no columns are defined for it "
+ "in the config file either."
+ )
+ # Otherwise: defined remotely, with no local detail; that's OK.
+ continue
- def _dest_tables_indexes_auto(self) -> Dict[str, List[Index]]:
- if self.format != NlpDefValues.FORMAT_GATE:
- return {} # indexes can't be returned by the server
- tables = {} # type: Dict[str, List[Index]]
- for table in self.schema:
- tables[table] = self._standard_gate_indexes()
- return tables
+ # Done.
+ return table_columns
def dest_tables_indexes(self) -> Dict[str, List[Index]]:
- # Docstring in superclass
- if self._outputtypemap and all(
- x.destfields for x in self._outputtypemap.values()
- ):
- return self._dest_tables_indexes_user()
- elif self.is_tabular():
- return self._dest_tables_indexes_auto()
- else:
- raise ValueError(
- "You haven't specified a table structure and "
- "the processor hasn't provided one."
- )
+ """
+ Describes indexes that this NLP processor suggests for its destination
+ table(s).
- def dest_tables_columns(self) -> Dict[str, List[Column]]:
- # Docstring in superclass
- if self._outputtypemap and all(
- x.destfields for x in self._outputtypemap.values()
- ):
- return self._dest_tables_columns_user()
- elif self.is_tabular():
- # Must have processor-defined schema because we already checked
- # for it
- return self._dest_tables_columns_auto()
- else:
- raise ValueError(
- "You haven't specified a table structure and "
- "the processor hasn't provided one."
+ Returns:
+ dict: a dictionary of ``{tablename: indexes}``, where ``indexes``
+ is a list of SQLAlchemy :class:`Index` objects.
+
+ The NLPRP remote table specification doesn't include indexing. So all
+ indexing information is from our config file, whether for GATE or
+ cloud processors.
+ """
+ table_indexes = {} # type: Dict[str, List[Index]]
+ for output_type, otconfig in self._outputtypemap.items():
+ dest_tablename = otconfig.dest_tablename
+ table_indexes[dest_tablename] = (
+ self._standard_indexes_if_gate(dest_tablename)
+ + otconfig.indexes
)
+ return table_indexes
diff --git a/crate_anon/nlp_manager/cloud_request.py b/crate_anon/nlp_manager/cloud_request.py
index 3f75ac90..34928edc 100644
--- a/crate_anon/nlp_manager/cloud_request.py
+++ b/crate_anon/nlp_manager/cloud_request.py
@@ -55,18 +55,26 @@
from semantic_version import Version
from urllib3.exceptions import NewConnectionError
-from crate_anon.common.constants import JSON_INDENT, JSON_SEPARATORS_COMPACT
+from crate_anon.common.constants import (
+ JSON_INDENT,
+ JSON_SEPARATORS_COMPACT,
+ NoneType,
+)
from crate_anon.common.memsize import getsize
from crate_anon.common.stringfunc import does_text_contain_word_chars
from crate_anon.nlp_manager.cloud_parser import Cloud
from crate_anon.nlp_manager.constants import (
FN_NLPDEF,
+ FN_SRCPKSTR,
+ FN_SRCPKVAL,
FN_WHEN_FETCHED,
- full_sectionname,
+ GateFieldNames,
GateResultKeys,
NlpConfigPrefixes,
NlpDefValues,
+ full_sectionname,
)
+from crate_anon.nlp_manager.models import FN_SRCHASH
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlprp.api import (
@@ -78,7 +86,7 @@
)
from crate_anon.nlprp.constants import (
NlprpCommands,
- NlprpKeys as NKeys,
+ NlprpKeys,
NlprpValues,
NlprpVersions,
)
@@ -121,12 +129,12 @@ def report_processor_errors(processor_data: Dict[str, Any]) -> None:
Should only be called if there has been an error. Reports the error(s) to
the log.
"""
- name = processor_data[NKeys.NAME]
- version = processor_data[NKeys.VERSION]
+ name = processor_data[NlprpKeys.NAME]
+ version = processor_data[NlprpKeys.VERSION]
error_messages = "\n".join(
- f"{error[NKeys.CODE]} - {error[NKeys.MESSAGE]}: "
- f"{error[NKeys.DESCRIPTION]}"
- for error in processor_data[NKeys.ERRORS]
+ f"{error[NlprpKeys.CODE]} - {error[NlprpKeys.MESSAGE]}: "
+ f"{error[NlprpKeys.DESCRIPTION]}"
+ for error in processor_data[NlprpKeys.ERRORS]
)
log.error(
f"Processor {name!r} (version {version}) failed for this "
@@ -134,6 +142,166 @@ def report_processor_errors(processor_data: Dict[str, Any]) -> None:
)
+def extract_nlprp_top_level_results(nlp_data: JsonObjectType) -> List:
+ """
+ Checks that the top-level NLP response contains an appropriate "results"
+ object, or raises KeyError or ValueError.
+
+ Returns the list result, which is a list of results per document.
+ """
+ try:
+ docresultlist = nlp_data[NlprpKeys.RESULTS]
+ except KeyError:
+ raise KeyError(
+ "Top-level response does not contain key "
+ f"{NlprpKeys.RESULTS!r}: {nlp_data!r}"
+ )
+ if not isinstance(docresultlist, list):
+ raise ValueError(
+ f"{NlprpKeys.RESULTS!r} value is not a list: {docresultlist!r}"
+ )
+ return docresultlist
+
+
+def parse_nlprp_docresult_metadata(
+ docresult: JsonObjectType,
+) -> Tuple[Dict[str, Any], Optional[int], Optional[str], str]:
+ """
+ Check that this NLPRP document result validly contains metadata, and that
+ metadata contains things we always send. Extract key components. Provide
+ helpful error message on failure.
+
+ Returns:
+        tuple ``(metadata, pkval, pkstr, srchash)``
+
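+    A minimal illustration (invented values; these are the metadata keys that
+    the client sends via ``add_text()``):
+
+    .. code-block:: none
+
+        docresult = {
+            NlprpKeys.METADATA: {
+                FN_SRCPKVAL: 123,
+                FN_SRCPKSTR: None,
+                FN_SRCHASH: "a1b2c3",
+            }
+        }
+        parse_nlprp_docresult_metadata(docresult)
+        # -> (metadata_dict, 123, None, "a1b2c3")
+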
+ """
+ try:
+ metadata = docresult[NlprpKeys.METADATA]
+ except KeyError:
+ raise KeyError(
+ "Document result does not contain key "
+ f"{NlprpKeys.METADATA!r}: {docresult!r}"
+ )
+ if not isinstance(metadata, dict):
+ # ... expected type because that's what we sent; see add_text()
+ raise KeyError(f"Document result metadata is not a dict: {metadata!r}")
+
+ try:
+ pkval = metadata[FN_SRCPKVAL]
+ except KeyError:
+ raise KeyError(
+ "Document metadata does not contain key "
+ f"{FN_SRCPKVAL!r}: {metadata!r}"
+ )
+ if not isinstance(pkval, (int, NoneType)):
+ # ... expected type because that's what we sent; see add_text()
+        raise ValueError(
+ f"Document result metadata {FN_SRCPKVAL!r} is not null or int: "
+ f"{pkval!r}"
+ )
+
+ try:
+ pkstr = metadata[FN_SRCPKSTR]
+ except KeyError:
+ raise KeyError(
+ "Document metadata does not contain key "
+ f"{FN_SRCPKSTR!r}: {metadata!r}"
+ )
+ if not isinstance(pkstr, (str, NoneType)):
+        raise ValueError(
+            f"Document result metadata {FN_SRCPKSTR!r} is not null or str: "
+ f"{pkstr!r}"
+ )
+
+ if pkval is None and pkstr is None:
+ raise ValueError(
+ f"In document result, both {FN_SRCPKVAL!r} and "
+ f"{FN_SRCPKSTR!r} are null"
+ )
+
+ try:
+ srchash = metadata[FN_SRCHASH]
+ except KeyError:
+ raise KeyError(
+ "Document metadata does not contain key "
+ f"{FN_SRCPKSTR!r}: {metadata!r}"
+ )
+ if not isinstance(srchash, str):
+        raise ValueError(
+            f"Document result metadata {FN_SRCHASH!r} is not str: "
+ f"{srchash!r}"
+ )
+
+ return metadata, pkval, pkstr, srchash
+
+
+def extract_processor_data_list(
+ docresult: JsonObjectType,
+) -> List[JsonObjectType]:
+ """
+ Check and extract a list of per-processor results from a single-document
+ NLPRP result.
+ """
+ try:
+ processor_data_list = docresult[NlprpKeys.PROCESSORS]
+ except KeyError:
+ raise KeyError(
+ "Document result does not contain key "
+ f"{NlprpKeys.PROCESSORS!r}: {docresult!r}"
+ )
+ if not isinstance(processor_data_list, list):
+ raise ValueError(
+ f"Document result's {NlprpKeys.PROCESSORS!r} element is not a "
+ f"list: {processor_data_list!r}"
+ )
+ return processor_data_list
+
+
+def parse_per_processor_data(processor_data: Dict[str, Any]) -> Tuple:
+ """
+    Return a tuple ``(name, version, is_default_version, success,
+    processor_results)`` of mandatory results from NLPRP per-processor data,
+    or raise KeyError/ValueError.
+ """
+ if not isinstance(processor_data, dict):
+ raise ValueError(f"Processor result is not a dict: {processor_data!r}")
+
+ try:
+ name = processor_data[NlprpKeys.NAME]
+ except KeyError:
+ raise KeyError(
+ "Processor result does not contain key "
+ f"{NlprpKeys.NAME!r}: {processor_data!r}"
+ )
+
+ try:
+ version = processor_data[NlprpKeys.VERSION]
+ except KeyError:
+ raise KeyError(
+ "Processor result does not contain key "
+ f"{NlprpKeys.VERSION!r}: {processor_data!r}"
+ )
+
+ is_default_version = processor_data.get(NlprpKeys.IS_DEFAULT_VERSION, True)
+
+ try:
+ success = processor_data[NlprpKeys.SUCCESS]
+ except KeyError:
+ raise KeyError(
+ "Processor result does not contain key "
+ f"{NlprpKeys.SUCCESS!r}: {processor_data!r}"
+ )
+
+ try:
+ processor_results = processor_data[NlprpKeys.RESULTS]
+ except KeyError:
+ raise KeyError(
+ "Processor result does not contain key "
+ f"{NlprpKeys.RESULTS!r}: {processor_data!r}"
+ )
+
+ return name, version, is_default_version, success, processor_results
+
+
# =============================================================================
# Exceptions
# =============================================================================
@@ -370,38 +538,63 @@ def get_remote_processors(self) -> List[ServerProcessor]:
"""
# Make request
list_procs_request = make_nlprp_dict()
- list_procs_request[NKeys.COMMAND] = NlprpCommands.LIST_PROCESSORS
+ list_procs_request[NlprpKeys.COMMAND] = NlprpCommands.LIST_PROCESSORS
request_json = to_json_str(list_procs_request)
# Send request, get response
json_response = self._post_get_json(request_json, may_fail=False)
- status = json_get_int(json_response, NKeys.STATUS)
+ status = json_get_int(json_response, NlprpKeys.STATUS)
if not HttpStatus.is_good_answer(status):
- errors = json_get_array(json_response, NKeys.ERRORS)
+ errors = json_get_array(json_response, NlprpKeys.ERRORS)
for err in errors:
log.error(f"Error received: {err!r}")
raise HTTPError(f"Response status was: {status}")
processors = [] # type: List[ServerProcessor]
- proclist = json_response[NKeys.PROCESSORS] # type: JsonArrayType
- for procinfo in proclist:
- proc = ServerProcessor(
- # Mandatory:
- name=procinfo[NKeys.NAME],
- title=procinfo[NKeys.TITLE],
- version=procinfo[NKeys.VERSION],
- is_default_version=procinfo.get(
- NKeys.IS_DEFAULT_VERSION, True
- ),
- description=procinfo[NKeys.DESCRIPTION],
- # Optional:
- schema_type=procinfo.get(
- NKeys.SCHEMA_TYPE, NlprpValues.UNKNOWN
- ),
- sql_dialect=procinfo.get(NKeys.SQL_DIALECT, ""),
- tabular_schema=procinfo.get(NKeys.TABULAR_SCHEMA),
+ try:
+ proclist = json_response[
+ NlprpKeys.PROCESSORS
+ ] # type: JsonArrayType
+ except KeyError:
+ raise KeyError(
+ f"Server did not provide key {NlprpKeys.PROCESSORS!r} in its "
+ f"response: {json_response!r}"
)
+ if not isinstance(proclist, list):
+ raise ValueError(
+ f"Server's value of {NlprpKeys.PROCESSORS!r} is not a list: "
+ f"{proclist!r}"
+ )
+ for procinfo in proclist:
+ if not isinstance(procinfo, dict):
+ raise ValueError(
+ f"Server's procinfo object not a dict: {procinfo!r}"
+ )
+ # Any of the following may raise KeyError if missing:
+ try:
+ proc = ServerProcessor(
+ # Mandatory:
+ name=procinfo[NlprpKeys.NAME],
+ title=procinfo[NlprpKeys.TITLE],
+ version=procinfo[NlprpKeys.VERSION],
+ is_default_version=procinfo.get(
+ NlprpKeys.IS_DEFAULT_VERSION, True
+ ),
+ description=procinfo[NlprpKeys.DESCRIPTION],
+ # Optional:
+ schema_type=procinfo.get(
+ NlprpKeys.SCHEMA_TYPE, NlprpValues.UNKNOWN
+ ),
+ sql_dialect=procinfo.get(NlprpKeys.SQL_DIALECT, ""),
+ tabular_schema=procinfo.get(NlprpKeys.TABULAR_SCHEMA),
+ )
+ except KeyError:
+ log.critical(
+ "NLPRP server's processor information is missing a "
+ "required field"
+ )
+ raise
processors.append(proc)
return processors
@@ -449,17 +642,17 @@ def __init__(
# Set up processing request
self._request_process = make_nlprp_dict()
- self._request_process[NKeys.COMMAND] = NlprpCommands.PROCESS
- self._request_process[NKeys.ARGS] = {
- NKeys.PROCESSORS: [], # type: List[str]
- NKeys.QUEUE: True,
- NKeys.CLIENT_JOB_ID: self._client_job_id,
- NKeys.INCLUDE_TEXT: False,
- NKeys.CONTENT: [], # type: List[str]
+ self._request_process[NlprpKeys.COMMAND] = NlprpCommands.PROCESS
+ self._request_process[NlprpKeys.ARGS] = {
+ NlprpKeys.PROCESSORS: [], # type: List[str]
+ NlprpKeys.QUEUE: True,
+ NlprpKeys.CLIENT_JOB_ID: self._client_job_id,
+ NlprpKeys.INCLUDE_TEXT: False,
+ NlprpKeys.CONTENT: [], # type: List[str]
}
# Set up fetch_from_queue request
self._fetch_request = make_nlprp_dict()
- self._fetch_request[NKeys.COMMAND] = NlprpCommands.FETCH_FROM_QUEUE
+ self._fetch_request[NlprpKeys.COMMAND] = NlprpCommands.FETCH_FROM_QUEUE
self.nlp_data = None # type: Optional[JsonObjectType]
# ... the JSON response
@@ -542,7 +735,7 @@ def compare(x: Any) -> None:
It can be quite a big overestimate, so we probably shouldn't chuck
out requests just because the Python size looks too big.
- """ # noqa
+ """ # noqa: E501
if not max_length: # None or 0
return False # no maximum; not too long
# Fast, apt to overestimate size a bit (as above)
@@ -572,10 +765,12 @@ def _add_processor_to_request(
procname: name of processor on the server
procversion: version of processor on the server
"""
- info = {NKeys.NAME: procname}
+ info = {NlprpKeys.NAME: procname}
if procversion:
- info[NKeys.VERSION] = procversion
- self._request_process[NKeys.ARGS][NKeys.PROCESSORS].append(info)
+ info[NlprpKeys.VERSION] = procversion
+ self._request_process[NlprpKeys.ARGS][NlprpKeys.PROCESSORS].append(
+ info
+ )
def _add_all_processors_to_request(self) -> None:
"""
@@ -624,10 +819,10 @@ def add_text(self, text: str, metadata: Dict[str, Any]) -> None:
if self.number_of_records > self._cloudcfg.max_records_per_request:
raise RecordsPerRequestExceeded
- new_content = {NKeys.METADATA: metadata, NKeys.TEXT: text}
+ new_content = {NlprpKeys.METADATA: metadata, NlprpKeys.TEXT: text}
# Add all the identifying information.
- args = self._request_process[NKeys.ARGS]
- content_key = NKeys.CONTENT # local copy for fractional speedup
+ args = self._request_process[NlprpKeys.ARGS]
+ content_key = NlprpKeys.CONTENT # local copy for fractional speedup
old_content = copy(args[content_key])
args[content_key].append(new_content)
max_length = self._cloudcfg.max_content_length
@@ -658,25 +853,25 @@ def send_process_request(
should the server include the source text in the reply?
"""
# Don't send off an empty request
- if not self._request_process[NKeys.ARGS][NKeys.CONTENT]:
+ if not self._request_process[NlprpKeys.ARGS][NlprpKeys.CONTENT]:
log.warning("Request empty - not sending.")
return
# Create request
if cookies:
self.cookies = cookies
- self._request_process[NKeys.ARGS][NKeys.QUEUE] = queue
- self._request_process[NKeys.ARGS][
- NKeys.INCLUDE_TEXT
+ self._request_process[NlprpKeys.ARGS][NlprpKeys.QUEUE] = queue
+ self._request_process[NlprpKeys.ARGS][
+ NlprpKeys.INCLUDE_TEXT
] = include_text_in_reply
request_json = to_json_str(self._request_process)
# Send request; get response
json_response = self._post_get_json(request_json)
- status = json_response[NKeys.STATUS]
+ status = json_response[NlprpKeys.STATUS]
if queue and status == HttpStatus.ACCEPTED:
- self.queue_id = json_response[NKeys.QUEUE_ID]
+ self.queue_id = json_response[NlprpKeys.QUEUE_ID]
self._fetched = False
elif (not queue) and status == HttpStatus.OK:
self.nlp_data = json_response
@@ -711,7 +906,9 @@ def _try_fetch(
# Create request
if cookies:
self.cookies = cookies
- self._fetch_request[NKeys.ARGS] = {NKeys.QUEUE_ID: self.queue_id}
+ self._fetch_request[NlprpKeys.ARGS] = {
+ NlprpKeys.QUEUE_ID: self.queue_id
+ }
request_json = to_json_str(self._fetch_request)
# Send request; get response
@@ -732,9 +929,9 @@ def check_if_ready(self, cookies: CookieJar = None) -> bool:
json_response = self._try_fetch(cookies)
if not json_response:
return False
- status = json_response[NKeys.STATUS]
+ status = json_response[NlprpKeys.STATUS]
pending_use_202 = (
- Version(json_response[NKeys.VERSION])
+ Version(json_response[NlprpKeys.VERSION])
>= NlprpVersions.FETCH_Q_PENDING_RETURNS_202
)
if status == HttpStatus.OK:
@@ -765,87 +962,133 @@ def check_if_ready(self, cookies: CookieJar = None) -> bool:
# -------------------------------------------------------------------------
@staticmethod
- def get_nlp_values_internal(
- processor_data: Dict[str, Any], metadata: Dict[str, Any]
- ) -> Generator[Tuple[str, Dict[str, Any], str], None, None]:
+ def gen_nlp_values_generic_single_table(
+ processor: Cloud,
+ tablename: str,
+ rows: List[Dict[str, Any]],
+ metadata: Dict[str, Any],
+ column_renames: Dict[str, str] = None,
+ ) -> Generator[Tuple[str, Dict[str, Any], Cloud], None, None]:
"""
- Get result values from processed data from a CRATE server-side.
+ Get result values from processed data, where the results object is a
+ list of rows (each row in dictionary format), all for a single table,
+ such as from a remote CRATE server.
+
+ Success should have been pre-verified.
Args:
- processor_data:
- NLPRP results for one processor
+ processor:
+ The processor object.
+ tablename:
+ The table name to use.
+ rows:
+ List of NLPRP results for one processor. Each result represents
+ a row of a table and is in dictionary format.
metadata:
The metadata for a particular document - it would have been
sent with the document and the server would have sent it back.
+ column_renames:
+ Column renames to apply.
- Yields ``(output_tablename, formatted_result, processor_name)``.
+ Yields ``(output_tablename, formatted_result, processor)``.
"""
- if not processor_data[NKeys.SUCCESS]:
- report_processor_errors(processor_data)
- return
- for result in processor_data[NKeys.RESULTS]:
- result.update(metadata)
- yield result
+ column_renames = column_renames or {}
+ for row in rows:
+ rename_keys_in_dict(row, column_renames)
+ row.update(metadata)
+ yield tablename, row, processor
@staticmethod
- def get_nlp_values_gate(
- processor_data: Dict[str, Any],
+ def gen_nlp_values_gate(
processor: Cloud,
+ processor_results: List[Dict[str, Any]],
metadata: Dict[str, Any],
text: str = "",
- ) -> Generator[Tuple[Dict[str, Any], Cloud], None, None]:
+ ) -> Generator[Tuple[str, Dict[str, Any], Cloud], None, None]:
"""
- Gets result values from processed GATE data which will originally
- be in the following format:
+ Generates row values from processed GATE data.
+
+ Success should have been pre-verified.
+
+ Args:
+ processor:
+ The processor object:
+ processor_results:
+ A list of dictionaries (originally from JSON), each
+ representing a row in a table, and each expected to have this
+ format:
+
+ .. code-block:: none
+
+ {
+ 'set': set the results belong to (e.g. 'Medication'),
+ 'type': annotation type,
+ 'start': start index,
+ 'end': end index,
+ 'features': {
+ a dictionary of features, e.g. having keys 'drug',
+ 'frequency', etc., with corresponding values
+ }
+ }
+
+ metadata:
+ The metadata for a particular document - it would have been
+ sent with the document and the server would have sent it back.
+ text:
+ The source text itself (optional).
+
+ Yields:
+
+ tuples ``(output_tablename, formatted_result, processor)``
+
+ Each instance of ``formatted_result`` has this format:
.. code-block:: none
{
- 'set': set the results belong to (e.g. 'Medication'),
- 'type': annotation type,
- 'start': start index,
- 'end': end index,
- 'features': {a dictionary of features e.g. 'drug', 'frequency', etc}
+ GateFieldNames.TYPE: annotation type,
+ GateFieldNames.SET: set,
+ GateFieldNames.STARTPOS: start index,
+ GateFieldNames.ENDPOS: end index,
+ GateFieldNames.CONTENT: text fragment,
+ FEATURE1: VALUE1,
+ FEATURE2: VALUE2,
+ ...
}
-
- Yields ``(output_tablename, formatted_result, processor_name)``.
- """ # noqa
- if not processor_data[NKeys.SUCCESS]:
- report_processor_errors(processor_data)
- return
- for result in processor_data[NKeys.RESULTS]:
- # Assuming each set of results says what annotation type
- # it is (annotation type is stored as lower case)
- annottype = result[GateResultKeys.TYPE].lower()
- features = result[GateResultKeys.FEATURES]
- start = result[GateResultKeys.START]
- end = result[GateResultKeys.END]
+ """
+ for row in processor_results:
+ # Assuming each row says what annotation type it is (annotation
+ # type is stored as lower case):
+ annottype = row[GateResultKeys.TYPE].lower()
+ features = row[GateResultKeys.FEATURES]
+ start = row[GateResultKeys.START]
+ end = row[GateResultKeys.END]
formatted_result = {
- "_type": annottype,
- "_set": result[GateResultKeys.SET],
- "_start": start,
- "_end": end,
- "_content": "" if not text else text[start:end],
+ GateFieldNames.TYPE: annottype,
+ GateFieldNames.SET: row[GateResultKeys.SET],
+ GateFieldNames.STARTPOS: start,
+ GateFieldNames.ENDPOS: end,
+ GateFieldNames.CONTENT: text[start:end] if text else "",
}
-
formatted_result.update(features)
c = processor.get_otconf_from_type(annottype)
rename_keys_in_dict(formatted_result, c.renames)
set_null_values_in_dict(formatted_result, c.null_literals)
formatted_result.update(metadata)
tablename = processor.get_tablename_from_type(annottype)
- yield tablename, formatted_result
+ yield tablename, formatted_result, processor
- def get_nlp_values(
+ def gen_nlp_values(
self,
- ) -> Generator[Tuple[Dict[str, Any], Cloud], None, None]:
+ ) -> Generator[Tuple[str, Dict[str, Any], Cloud], None, None]:
"""
Process response data that we have already obtained from the server,
generating individual NLP results.
Yields:
``(tablename, result, processor)`` for each result.
+ The ``tablename`` value is the actual destination database table.
Raises:
:exc:`KeyError` if an unexpected processor turned up in the results
@@ -855,21 +1098,24 @@ def get_nlp_values(
"Method 'get_nlp_values' must only be called "
"after nlp_data is obtained"
)
- for result in self.nlp_data[NKeys.RESULTS]:
- metadata = result[NKeys.METADATA] # type: Dict[str, Any]
- # ... expected type because that's what we sent; see add_text()
- text = result.get(NKeys.TEXT)
- for processor_data in result[NKeys.PROCESSORS]: # type: Dict
+ docresultlist = extract_nlprp_top_level_results(self.nlp_data)
+ for docresult in docresultlist:
+ metadata, _, _, _ = parse_nlprp_docresult_metadata(docresult)
+ text = docresult.get(NlprpKeys.TEXT)
+ processor_data_list = extract_processor_data_list(docresult)
+ for processor_data in processor_data_list:
+ # Details of the server processor that has responded:
+ (
+ name,
+ version,
+ is_default_version,
+ success,
+ processor_results,
+ ) = parse_per_processor_data(processor_data)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check that the processor was one we asked for.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # Details of the server processor that has responded:
- name = processor_data[NKeys.NAME]
- version = processor_data[NKeys.VERSION]
- is_default_version = processor_data.get(
- NKeys.IS_DEFAULT_VERSION, True
- )
try:
# Retrieve the Python object corresponding to the server
# processor that has responded:
@@ -895,20 +1141,95 @@ def get_nlp_values(
raise KeyError(failmsg)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # OK; we're happy with the processor. Process its results.
+ # OK; we're happy with the processor. Was it happy?
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ if not success:
+ report_processor_errors(processor_data)
+                    continue  # skip this processor; carry on with the rest
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ # All well. Process the results.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- if processor.format == NlpDefValues.FORMAT_GATE:
- for t, r in self.get_nlp_values_gate(
- processor_data, processor, metadata, text
- ):
- yield t, r, processor
+                # See nlprp.rst.
+ if isinstance(processor_results, dict):
+ # MULTI-TABLE FORMAT.
+ # This is a dictionary mapping tables to row lists.
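+                    # For example (table/column names and values invented):
+                    #   {"tab1": [{"col_a": 1}, {"col_a": 2}],
+                    #    "tab2": [{"col_b": "x"}]}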
+ if not processor.is_tabular():
+ raise RuntimeError(
+ f"Unsupported: processor {name!r} is returning "
+ f"multi-table results but hasn't provided a "
+ f"table schema"
+ )
+ tnames = processor.get_tabular_schema_tablenames()
+ for remote_tablename, rows in processor_results.items():
+ if remote_tablename not in tnames:
+ raise ValueError(
+ f"For processor {name!r}, data provided for "
+ f"table {remote_tablename!r}, but this was "
+ "not in the schema"
+ )
+ dest_tablename = processor.get_tablename_from_type(
+ remote_tablename
+ )
+ yield from self.gen_nlp_values_generic_single_table(
+ processor=processor,
+ tablename=dest_tablename,
+ rows=rows,
+ metadata=metadata,
+ column_renames=processor.get_otconf_from_type(
+ remote_tablename
+ ).renames,
+ )
+ elif isinstance(processor_results, list):
+ # SINGLE TABLE FORMAT.
+ # This is a list of rows, where each row should be a
+ # dictionary mapping column names to values.
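+                    # For example (column names and values invented):
+                    #   [{"col_a": 1, "col_b": "x"},
+                    #    {"col_a": 2, "col_b": "y"}]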
+ if processor.format == NlpDefValues.FORMAT_GATE:
+ # We have special knowledge of the "traditional" GATE
+ # format. The sub-function will work out the table
+ # name(s).
+ yield from self.gen_nlp_values_gate(
+ processor=processor,
+ processor_results=processor_results,
+ metadata=metadata,
+ text=text,
+ )
+ else:
+ # Potentially valid whether or not there is a
+ # tabular_schema. The results object is a generic list
+ # of column_name/value dictionaries.
+ if processor.is_tabular():
+ # Only valid here if there is a SINGLE table in
+ # the tabular_schema.
+ tnames = processor.get_tabular_schema_tablenames()
+ if len(tnames) != 1:
+ raise ValueError(
+ f"Processor {name!r} returned results in "
+ "list format, but this is only valid for "
+ "a single table; its tables are "
+ f"{tnames!r}"
+ )
+ remote_tablename = tnames[0]
+ else:
+ # We use the FIRST defined table name.
+ remote_tablename = processor.get_first_tablename()
+ dest_tablename = processor.get_tablename_from_type(
+ remote_tablename
+ )
+ yield from self.gen_nlp_values_generic_single_table(
+ processor=processor,
+ tablename=dest_tablename,
+ rows=processor_results,
+ metadata=metadata,
+ column_renames=processor.get_otconf_from_type(
+ remote_tablename
+ ).renames,
+ )
else:
- for res in self.get_nlp_values_internal(
- processor_data, metadata
- ):
- # For non-GATE processors ther will only be one table
- # name
- yield processor.tablename, res, processor
+ raise ValueError(
+ f"For processor {name!r}, bad results format: "
+ f"{processor_results!r}"
+ )
# @do_cprofile
def process_all(self) -> None:
@@ -921,7 +1242,7 @@ def process_all(self) -> None:
sessions = []
- for tablename, nlp_values, processor in self.get_nlp_values():
+ for tablename, nlp_values, processor in self.gen_nlp_values():
nlp_values[FN_NLPDEF] = nlpname
session = processor.dest_session
if session not in sessions:
@@ -973,12 +1294,12 @@ def show_queue(self) -> Optional[List[Dict[str, Any]]]:
request_json = to_json_str(show_request)
json_response = self._post_get_json(request_json, may_fail=False)
- status = json_response[NKeys.STATUS]
+ status = json_response[NlprpKeys.STATUS]
if status == HttpStatus.OK:
try:
- queue = json_response[NKeys.QUEUE]
+ queue = json_response[NlprpKeys.QUEUE]
except KeyError:
- log.error(f"Response did not contain key {NKeys.QUEUE!r}.")
+ log.error(f"Response did not contain key {NlprpKeys.QUEUE!r}.")
raise
return queue
else:
@@ -991,7 +1312,7 @@ def delete_all_from_queue(self) -> None:
"""
delete_request = make_nlprp_request(
command=NlprpCommands.DELETE_FROM_QUEUE,
- command_args={NKeys.DELETE_ALL: True},
+ command_args={NlprpKeys.DELETE_ALL: True},
)
request_json = to_json_str(delete_request)
response = self._post(request_json, may_fail=False)
@@ -1014,7 +1335,7 @@ def delete_from_queue(self, queue_ids: List[str]) -> None:
"""
delete_request = make_nlprp_request(
command=NlprpCommands.DELETE_FROM_QUEUE,
- command_args={NKeys.QUEUE_IDS: queue_ids},
+ command_args={NlprpKeys.QUEUE_IDS: queue_ids},
)
request_json = to_json_str(delete_request)
response = self._post(request_json, may_fail=False)
diff --git a/crate_anon/nlp_manager/input_field_config.py b/crate_anon/nlp_manager/input_field_config.py
index c00c9c96..01074cd0 100644
--- a/crate_anon/nlp_manager/input_field_config.py
+++ b/crate_anon/nlp_manager/input_field_config.py
@@ -45,13 +45,22 @@
table_or_view_exists,
)
from cardinal_pythonlib.timing import MultiTimerContext, timer
-from sqlalchemy import BigInteger, Column, DateTime, Index, String, Table
+from sqlalchemy import (
+ BigInteger,
+ Column,
+ DateTime,
+ Index,
+ Integer,
+ String,
+ Table,
+)
from sqlalchemy.engine.base import Engine
from sqlalchemy.orm.session import Session
from sqlalchemy.sql import and_, column, exists, null, or_, select, table
from sqlalchemy.sql.schema import MetaData
from crate_anon.common.parallel import is_my_job_by_hash_prehashed
+from crate_anon.common.sql import decorate_index_name
from crate_anon.common.stringfunc import relevant_for_nlp
from crate_anon.nlp_manager.constants import (
FN_CRATE_VERSION_FIELD,
@@ -254,7 +263,9 @@ def get_core_columns_for_dest() -> List[Column]:
return [
Column(
FN_PK,
- BigInteger,
+ # Autoincrement under SQLite needs a trick:
+ # https://docs.sqlalchemy.org/en/20/dialects/sqlite.html
+ BigInteger().with_variant(Integer, "sqlite"),
primary_key=True,
autoincrement=True,
comment="Arbitrary primary key (PK) of output record",
@@ -322,21 +333,28 @@ def get_core_columns_for_dest() -> List[Column]:
]
@staticmethod
- def get_core_indexes_for_dest() -> List[Index]:
+ def get_core_indexes_for_dest(
+ tablename: str, engine: Engine
+ ) -> List[Index]:
"""
Returns the core indexes to be applied to the destination tables.
Primarily, these are for columns that refer to the source.
+ Args:
+ tablename:
+ The name of the table to be used in the destination.
+ engine:
+ The destination database SQLAlchemy Engine.
+
Returns:
a list of SQLAlchemy :class:`Index` objects
See
 
         - https://stackoverflow.com/questions/179085/multiple-indexes-vs-multi-column-indexes
-        """ # noqa
+ """ # noqa: E501
return [
Index(
- "_idx_srcref",
+ decorate_index_name("_idx_srcref", tablename, engine),
# Remember, order matters; more to less specific
# See also BaseNlpParser.delete_dest_record
FN_SRCPKVAL,
@@ -346,9 +364,12 @@ def get_core_indexes_for_dest() -> List[Index]:
FN_SRCDB,
FN_SRCPKSTR,
),
- Index("_idx_srcdate", FN_SRCDATETIMEVAL),
Index(
- "_idx_deletion",
+ decorate_index_name("_idx_srcdate", tablename, engine),
+ FN_SRCDATETIMEVAL,
+ ),
+ Index(
+ decorate_index_name("_idx_deletion", tablename, engine),
# We sometimes delete just using the following; see
# BaseNlpParser.delete_where_srcpk_not
FN_NLPDEF,
diff --git a/crate_anon/nlp_manager/models.py b/crate_anon/nlp_manager/models.py
index db920aea..8e56aa0b 100644
--- a/crate_anon/nlp_manager/models.py
+++ b/crate_anon/nlp_manager/models.py
@@ -29,7 +29,7 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.schema import Column, Index, MetaData
-from sqlalchemy.types import BigInteger, DateTime, String
+from sqlalchemy.types import BigInteger, DateTime, Integer, String
from crate_anon.anonymise.constants import COMMENT, TABLE_KWARGS
from crate_anon.nlp_manager.constants import (
@@ -96,7 +96,8 @@ class NlpRecord(ProgressBase):
pk = Column(
"pk",
- BigInteger,
+ # https://docs.sqlalchemy.org/en/20/dialects/sqlite.html
+ BigInteger().with_variant(Integer, "sqlite"),
primary_key=True,
autoincrement=True,
comment="PK of NLP record (no specific use)",
diff --git a/crate_anon/nlp_manager/nlp_definition.py b/crate_anon/nlp_manager/nlp_definition.py
index 5940a416..643f8e12 100644
--- a/crate_anon/nlp_manager/nlp_definition.py
+++ b/crate_anon/nlp_manager/nlp_definition.py
@@ -175,7 +175,7 @@ def _make_proclist(
hashphrase = "doesnotmatter"
if_clin_docs = "INPUT_FIELD_CLINICAL_DOCUMENTS"
if_prog_notes = "INPUT_FIELD_PROGRESS_NOTES"
- inputfields = f"{if_clin_docs}\n" f" {if_prog_notes}"
+ inputfields = f"{if_clin_docs}\n {if_prog_notes}"
truncate_text_at = "32766"
my_env = "MY_ENV_SECTION"
my_src_db = "SOURCE_DATABASE"
@@ -814,7 +814,7 @@ def _make_proclist(
{ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_biochemistry.Crp
{ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD}
-""" # noqa
+""" # noqa: E501
# =============================================================================
diff --git a/crate_anon/nlp_manager/nlp_manager.py b/crate_anon/nlp_manager/nlp_manager.py
index 723b5341..75378ecf 100755
--- a/crate_anon/nlp_manager/nlp_manager.py
+++ b/crate_anon/nlp_manager/nlp_manager.py
@@ -58,7 +58,7 @@
mysqld about 18% [from top]
nlp_manager.py about 4-5% * 8 [from top]
-""" # noqa
+""" # noqa: E501
# =============================================================================
# Imports
@@ -113,37 +113,36 @@
BaseNlpParser,
TextProcessingFailed,
)
+from crate_anon.nlp_manager.cloud_parser import Cloud
+from crate_anon.nlp_manager.cloud_request import (
+ CloudRequest,
+ CloudRequestListProcessors,
+ CloudRequestProcess,
+ CloudRequestQueueManagement,
+ extract_nlprp_top_level_results,
+ parse_nlprp_docresult_metadata,
+)
from crate_anon.nlp_manager.cloud_request_sender import CloudRequestSender
+from crate_anon.nlp_manager.cloud_run_info import CloudRunInfo
from crate_anon.nlp_manager.constants import (
DEFAULT_REPORT_EVERY_NLP,
- MAX_STRING_PK_LENGTH,
- NLP_CONFIG_ENV_VAR,
- NlpDefConfigKeys,
-)
-from crate_anon.nlp_manager.input_field_config import (
- InputFieldConfig,
FN_SRCDB,
- FN_SRCTABLE,
+ FN_SRCFIELD,
FN_SRCPKFIELD,
- FN_SRCPKVAL,
FN_SRCPKSTR,
- FN_SRCFIELD,
+ FN_SRCPKVAL,
+ FN_SRCTABLE,
+ MAX_STRING_PK_LENGTH,
+ NLP_CONFIG_ENV_VAR,
+ NlpDefConfigKeys,
TRUNCATED_FLAG,
)
-from crate_anon.nlp_manager.models import FN_SRCHASH, NlpRecord
+from crate_anon.nlp_manager.input_field_config import InputFieldConfig
+from crate_anon.nlp_manager.models import NlpRecord
from crate_anon.nlp_manager.nlp_definition import (
NlpDefinition,
demo_nlp_config,
)
-from crate_anon.nlp_manager.cloud_parser import Cloud
-from crate_anon.nlp_manager.cloud_request import (
- CloudRequest,
- CloudRequestListProcessors,
- CloudRequestProcess,
- CloudRequestQueueManagement,
-)
-from crate_anon.nlp_manager.cloud_run_info import CloudRunInfo
-from crate_anon.nlprp.constants import NlprpKeys as NKeys
from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE
# from crate_anon.common.profiling import do_cprofile
@@ -750,7 +749,6 @@ def retrieve_nlp_data(crinfo: CloudRunInfo, incremental: bool = False) -> None:
ifconfig_cache[if_section] = ifconfig
seen_srchashs = [] # type: List[str]
cloud_request = CloudRequestProcess(crinfo=crinfo)
- # cloud_request.set_mirror_processors(mirror_procs)
cloud_request.set_queue_id(queue_id)
log.info(f"Atempting to retrieve data from request #{i} ...")
i += 1
@@ -761,21 +759,20 @@ def retrieve_nlp_data(crinfo: CloudRunInfo, incremental: bool = False) -> None:
if not ready:
# If results are not ready for this particular queue_id, put
# back in file
- # request_data.write(f"{if_section},{queue_id}\n")
remaining_data.append(f"{if_section},{queue_id}\n")
all_ready = False
else:
- nlp_data = cloud_request.nlp_data
- for result in nlp_data[NKeys.RESULTS]:
+ docresultlist = extract_nlprp_top_level_results(
+ cloud_request.nlp_data
+ )
+ for result in docresultlist:
# There are records associated with the given queue_id
records_exist = True
uncommitted_data = True
# 'metadata' is just 'other_values' from before
- metadata = result[NKeys.METADATA] # type: Dict[str, Any]
- # ... expected type because that's what we sent; see add_text()
- pkval = metadata[FN_SRCPKVAL]
- pkstr = metadata[FN_SRCPKSTR]
- srchash = metadata[FN_SRCHASH]
+ _, pkval, pkstr, srchash = parse_nlprp_docresult_metadata(
+ result
+ )
progrec = None
if incremental:
progrec = ifconfig.get_progress_record(pkval, pkstr)
@@ -865,14 +862,17 @@ def process_cloud_now(
for cloud_request in cloud_requests:
if cloud_request.request_failed:
continue
+ # (a) handle the actual data
cloud_request.process_all()
- nlp_data = cloud_request.nlp_data
- for result in nlp_data[NKeys.RESULTS]:
+ # (b) handle the progress records
+ docresultlist = extract_nlprp_top_level_results(
+ cloud_request.nlp_data
+ )
+ for result in docresultlist:
# 'metadata' is just 'other_values' from before
- metadata = result[NKeys.METADATA]
- pkval = metadata[FN_SRCPKVAL]
- pkstr = metadata[FN_SRCPKSTR]
- srchash = metadata[FN_SRCHASH]
+ _, pkval, pkstr, srchash = parse_nlprp_docresult_metadata(
+ result
+ )
progrec = None
if incremental:
# A word of explanation: to get here, the record must
@@ -1078,7 +1078,7 @@ def test_nlp_stdin(nlpdef: NlpDefinition) -> None:
)
procreq.add_text(text, metadata={})
procreq.send_process_request(queue=False)
- results = procreq.nlp_data[NKeys.RESULTS]
+ results = extract_nlprp_top_level_results(procreq.nlp_data)
result_found = True
# ... may not really be true, but we have something to show
formatted_results = json.dumps(results, indent=JSON_INDENT)
diff --git a/crate_anon/nlp_manager/output_user_config.py b/crate_anon/nlp_manager/output_user_config.py
index 8340fde1..a05e45ac 100644
--- a/crate_anon/nlp_manager/output_user_config.py
+++ b/crate_anon/nlp_manager/output_user_config.py
@@ -65,14 +65,15 @@
class OutputUserConfig:
"""
- Class defining configuration for the output of a given GATE app.
+    Class defining configuration for the output of a given GATE app or a
+    remote cloud app.
See the documentation for the :ref:`NLP config file `.
"""
def __init__(
self,
- parser: ExtendedConfigParser,
+ config_parser: ExtendedConfigParser,
cfg_output_name: str,
schema_required: bool = True,
) -> None:
@@ -80,7 +81,7 @@ def __init__(
Read config from a configparser section.
Args:
- parser:
+ config_parser:
:class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser`
cfg_output_name:
config file section name suffix -- this is the second of the
@@ -89,18 +90,17 @@ def __init__(
- :ref:`NLP config file `
- :class:`crate_anon.nlp_manager.parse_gate.Gate`
-
- schema_required:
- is it required that the user has specified a schema, i.e.
- destfields and a desttable? - Should be true for Gate, False
- for Cloud as the remote processors may have their own schema
- definition.
+            schema_required:
+                is it required that the user has specified a schema, i.e.
+                destfields and a desttable? Should be true for GATE, and
+                false for cloud, since remote processors may provide their
+                own schema definition.
"""
sectionname = full_sectionname(
NlpConfigPrefixes.OUTPUT, cfg_output_name
)
- cfg = ConfigSection(section=sectionname, parser=parser)
+ cfg = ConfigSection(section=sectionname, parser=config_parser)
# ---------------------------------------------------------------------
# desttable
@@ -129,10 +129,10 @@ def __init__(
f"section {sectionname!r}; line was {line!r} but should "
f"have contained two things"
)
- annotation_name = words[0]
- field_name = words[1]
- ensure_valid_field_name(field_name)
- self._renames[annotation_name] = field_name
+ annotation_or_remote_column_name = words[0]
+ to_column_name = words[1]
+ ensure_valid_field_name(to_column_name)
+ self._renames[annotation_or_remote_column_name] = to_column_name
# ---------------------------------------------------------------------
# null_literals
@@ -158,7 +158,6 @@ def __init__(
as_words=False,
)
# ... comments will be removed during that process.
- # log.critical(dest_field_lines)
-        # If dest_field_lines is empty (as it may be for a Cloud processor)
-        # the following block doesn't execute, so the 'dest' attributed remain
-        # empty
+        # If dest_field_lines is empty (as it may be for a Cloud processor),
+        # the following block doesn't execute, so the 'dest' attributes remain
+        # empty.
@@ -279,7 +278,8 @@ def indexes(self) -> List[Index]:
def renames(self) -> Dict[str, str]:
"""
Return the "rename dictionary": a dictionary mapping GATE annotation
- names to fieldnames in the NLP destination table.
+ names (or cloud remote column names) to local field (column) names in
+ the NLP destination table.
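+
+        For example (invented names), ``{"drug": "drug_name"}`` maps a GATE
+        ``drug`` annotation (or a remote ``drug`` column) to a local
+        ``drug_name`` destination column.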
See
diff --git a/crate_anon/nlp_manager/parse_clinical.py b/crate_anon/nlp_manager/parse_clinical.py
index af931e79..4fb75f30 100644
--- a/crate_anon/nlp_manager/parse_clinical.py
+++ b/crate_anon/nlp_manager/parse_clinical.py
@@ -642,7 +642,7 @@ class Bp(BaseNlpParser):
# Since we produce two variables, SBP and DBP, and we use something a
# little more complex than
- # :class:`crate_anon.nlp_manager.regex_parser.NumeratorOutOfDenominatorParser`, # noqa
+ # :class:`crate_anon.nlp_manager.regex_parser.NumeratorOutOfDenominatorParser`, # noqa: E501
# we subclass :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser`
# directly.)
diff --git a/crate_anon/nlp_manager/parse_cognitive.py b/crate_anon/nlp_manager/parse_cognitive.py
index 8f11cc4b..db553e06 100644
--- a/crate_anon/nlp_manager/parse_cognitive.py
+++ b/crate_anon/nlp_manager/parse_cognitive.py
@@ -153,7 +153,7 @@ class Ace(NumeratorOutOfDenominatorParser):
) \b
)?+
{WORD_BOUNDARY} )
- """ # noqa
+ """ # noqa: E501
# ... note the possessive "?+" above; see tests below.
def __init__(
diff --git a/crate_anon/nlp_manager/parse_gate.py b/crate_anon/nlp_manager/parse_gate.py
index 7e887f77..7a5d18b9 100644
--- a/crate_anon/nlp_manager/parse_gate.py
+++ b/crate_anon/nlp_manager/parse_gate.py
@@ -52,7 +52,7 @@
from crate_anon.nlp_manager.constants import (
MAX_SQL_FIELD_LEN,
ProcessorConfigKeys,
- GateFieldNames as GateFN,
+ GateFieldNames,
)
from crate_anon.nlp_manager.nlp_definition import (
NlpDefinition,
@@ -173,12 +173,23 @@ def __init__(
ProcessorConfigKeys.PROGARGS, required=True
)
logtag = nlpdef.logtag or "."
+ # Also, ensure the user doesn't specify desttable (would be
+ # confusing).
+ if self._cfgsection.opt_str(ProcessorConfigKeys.DESTTABLE):
+ raise ValueError(
+ f"For GATE processors, don't specify "
+ f"{ProcessorConfigKeys.DESTTABLE!r}; table information is "
+ f"in {ProcessorConfigKeys.OUTPUTTYPEMAP!r}"
+ )
self._outputtypemap = {} # type: Dict[str, OutputUserConfig]
self._type_to_tablename = {} # type: Dict[str, str]
for annottype, outputsection in chunks(typepairs, 2):
annottype = annottype.lower()
- c = OutputUserConfig(nlpdef.parser, outputsection)
+ c = OutputUserConfig(
+ config_parser=nlpdef.parser,
+ cfg_output_name=outputsection,
+ )
self._outputtypemap[annottype] = c
self._type_to_tablename[annottype] = c.dest_tablename
@@ -318,7 +329,7 @@ def parse(
d = tsv_pairs_to_dict(line)
log.debug(f"dictionary received: {d}")
try:
- annottype = d[GateFN.TYPE].lower()
+ annottype = d[GateFieldNames.TYPE].lower()
except KeyError:
raise ValueError("_type information not in data received")
if annottype not in self._type_to_tablename:
@@ -375,8 +386,9 @@ def dest_tables_indexes(self) -> Dict[str, List[Index]]:
# docstring in superclass
tables = {} # type: Dict[str, List[Index]]
for anottype, otconfig in self._outputtypemap.items():
- tables[otconfig.dest_tablename] = (
- self._standard_gate_indexes() + otconfig.indexes
+ dest_tablename = otconfig.dest_tablename
+ tables[dest_tablename] = (
+ self._standard_gate_indexes(dest_tablename) + otconfig.indexes
)
return tables
diff --git a/crate_anon/nlp_manager/parse_haematology.py b/crate_anon/nlp_manager/parse_haematology.py
index 5468823c..134c69b7 100644
--- a/crate_anon/nlp_manager/parse_haematology.py
+++ b/crate_anon/nlp_manager/parse_haematology.py
@@ -96,7 +96,7 @@ class Haemoglobin(SimpleNumericalResultParser):
old-style units, 9 g/dL = 90 g/L, but this will be interpreted as 9 g/L.
This problem is hard to avoid.
- """ # noqa
+ """ # noqa: E501
HAEMOGLOBIN_BASE = rf"""
{WORD_BOUNDARY} (?: Ha?emoglobin | Hb | HGB ) {WORD_BOUNDARY}
diff --git a/crate_anon/nlp_manager/parse_medex.py b/crate_anon/nlp_manager/parse_medex.py
index 385284f6..bf03a122 100644
--- a/crate_anon/nlp_manager/parse_medex.py
+++ b/crate_anon/nlp_manager/parse_medex.py
@@ -388,7 +388,7 @@
Therefore, we should have a Java parameter specified in the config file as
``-Dfile.encoding=UTF-8``.
-""" # noqa
+""" # noqa: E501
import logging
import os
diff --git a/crate_anon/nlp_manager/prepare_umls_for_bioyodie.py b/crate_anon/nlp_manager/prepare_umls_for_bioyodie.py
index 6d06752d..2a17ae2e 100755
--- a/crate_anon/nlp_manager/prepare_umls_for_bioyodie.py
+++ b/crate_anon/nlp_manager/prepare_umls_for_bioyodie.py
@@ -138,7 +138,7 @@
EXIT_SUCCESS,
)
-_ = """
+_ = r"""
===============================================================================
My launch command:
@@ -339,7 +339,7 @@
- set JVM limit to 10 GB (meaning it uses 13.9 GB) rather than 30 GB (making
it thrash, use 36 GB, and get killed).
-""" # noqa
+""" # noqa: E501
log = logging.getLogger(__name__)
@@ -372,7 +372,7 @@
# - FROM /tmp/tmpg1z_uldg/umls_unzipped
# - TO /tmp/tmpg1z_uldg/blah
# and using "include" rather than "exclude" mode, we got this:
-_NOTES_WORKING_CONFIG_FROM_GUI = """
+_NOTES_WORKING_CONFIG_FROM_GUI = r"""
# Configuration Properties File
#Mon Sep 14 17:45:02 BST 2020
mmsys_output_stream=gov.nih.nlm.umls.mmsys.io.RRFMetamorphoSysOutputStream
@@ -413,7 +413,7 @@
gov.nih.nlm.umls.mmsys.io.NLMFileMetamorphoSysInputStream.meta_source_uri=/tmp/tmpg1z_uldg/umls_unzipped
gov.nih.nlm.umls.mmsys.filter.SuppressibleFilter.confirm_default_suppressible_sabttys=HPO|OP;HPO|IS;HPO|OET;NEU|ACR;NEU|OP;NEU|IS;FMA|AB;FMA|OP;MDR|AB;CDT|OP;ICD10AE|PS;ICD10|PS;ICD10AMAE|PS;ICD10AM|PS;NCI|OP;NCI_NICHD|OP;ICPC|PS;ICPC|CS;MTHICPC2EAE|AB;ICPC2EENG|AB;ICPC2P|MTH_PT;ICPC2P|OPN;ICPC2P|MTH_OPN;ICPC2P|OP;ICPC2P|MTH_OP;HCPCS|OP;HCDT|OP;HCPT|OP;HCPCS|OM;HCPCS|OAM;GO|OP;GO|MTH_OP;GO|OET;GO|MTH_OET;GO|IS;GO|MTH_IS;PDQ|OP;PDQ|IS;NCBI|AUN;NCBI|UAUN;LNC|OLC;LNC|LO;LNC|MTH_LO;LNC|OOSN;MDR|OL;MDR|MTH_OL;ICD10PCS|HS;ICD10PCS|AB;ICD10AE|HS;ICD10|HS;HL7V3.0|OP;HL7V3.0|ONP;ICD10CM|AB;ICD9CM|AB;MSH|DEV;MSH|DSV;MSH|QAB;MSH|QEV;MSH|QSV;CPT|AB;HCPT|AB;HCPCS|AB;SNMI|PX;SNMI|HX;SNMI|SX;RCD|OP;RCD|IS;RCD|AS;RCD|AB;RCDSA|OP;RCDSY|OP;RCDAE|OP;RCDSA|IS;RCDSY|IS;RCDAE|IS;RCDSA|AB;RCDSY|AB;RCDAE|AB;RCDSA|OA;RCDSY|OA;RCDAE|OA;RCD|OA;RCDAE|AA;RCD|AA;HCPT|OA;HCPT|AM;HCPCS|OA;HCPCS|AM;HCDT|AB;ALT|AB;HCDT|OA;SNOMEDCT_VET|OAP;SNOMEDCT_VET|OP;SNOMEDCT_US|OAP;SNOMEDCT_US|OP;SNOMEDCT_VET|OAF;SNOMEDCT_VET|OF;SNOMEDCT_US|OAF;SNOMEDCT_US|OF;SNOMEDCT_VET|OAS;SNOMEDCT_VET|IS;SNOMEDCT_US|OAS;SNOMEDCT_US|IS;SNOMEDCT_US|MTH_OAP;SNOMEDCT_US|MTH_OP;SNOMEDCT_US|MTH_OAF;SNOMEDCT_US|MTH_OF;SNOMEDCT_US|MTH_OAS;SNOMEDCT_US|MTH_IS;CCPSS|TC;SCTSPA|OP;SCTSPA|OAF;SCTSPA|OAP;SCTSPA|OAS;SCTSPA|OF;SCTSPA|IS;SCTSPA|MTH_OP;SCTSPA|MTH_OAF;SCTSPA|MTH_OAP;SCTSPA|MTH_OAS;SCTSPA|MTH_OF;SCTSPA|MTH_IS;MSHNOR|DSV;MSHGER|DSV;MDRSPA|OL;MDRSPA|AB;MDRDUT|OL;MDRDUT|AB;MDRFRE|OL;MDRFRE|AB;MDRGER|OL;MDRGER|AB;MDRITA|OL;MDRITA|AB;MDRJPN|OL;MDRJPN|OLJKN;MDRJPN|OLJKN1;MDRCZE|OL;MDRHUN|OL;MDRPOR|OL;MDRCZE|AB;MDRHUN|AB;MDRPOR|AB;LNC-DE-CH|OOSN;LNC-DE-DE|LO;LNC-EL-GR|LO;LNC-ES-AR|LO;LNC-ES-AR|OOSN;LNC-ES-CH|OOSN;LNC-ES-ES|LO;LNC-ET-EE|LO;LNC-FR-BE|LO;LNC-FR-CA|LO;LNC-FR-CH|OOSN;LNC-FR-FR|OLC;LNC-FR-FR|LO;LNC-IT-CH|OOSN;LNC-IT-IT|LO;LNC-KO-KR|LO;LNC-NL-NL|LO;LNC-PT-BR|OLC;LNC-PT-BR|LO;LNC-PT-BR|OOSN;LNC-RU-RU|LO;LNC-TR-TR|LO;LNC-ZH-CN|LO;LNC-DE-AT|LO
gov.nih.nlm.umls.mmsys.filter.SourceListFilter.enforce_dep_source_selection=true
-""" # noqa
+"""
# =============================================================================
@@ -524,7 +524,7 @@ def get_mmsys_configfile_text(
awk -F '\\|' '{print $1"|"$1}' |
perl -pe 's/\n/;/g' \
> /tmp/sab_list.txt
- """ # noqa
+ """ # noqa: E501
# Config file that we'll use as a starting point
start_config_filename = join(mmsys_home, "config", release, "user.a.prop")
diff --git a/crate_anon/nlp_manager/processor_helpers.py b/crate_anon/nlp_manager/processor_helpers.py
new file mode 100644
index 00000000..752de8c1
--- /dev/null
+++ b/crate_anon/nlp_manager/processor_helpers.py
@@ -0,0 +1,69 @@
+"""
+crate_anon/nlp_manager/processor_helpers.py
+
+===============================================================================
+
+ Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
+ Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
+
+ This file is part of CRATE.
+
+ CRATE is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ CRATE is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+    along with CRATE. If not, see <https://www.gnu.org/licenses/>.
+
+===============================================================================
+
+**Helper functions to manage all NLP processor classes.**
+
+These use a delayed import, sorting out some circular import problems.
+
+"""
+
+# =============================================================================
+# Imports
+# =============================================================================
+
+from typing import Optional
+
+from crate_anon.nlp_manager.base_nlp_parser import TableMaker
+
+
+# =============================================================================
+# Helper functions
+# =============================================================================
+
+
+def make_nlp_parser_unconfigured(
+ classname: str, raise_if_absent: bool = True
+) -> Optional[TableMaker]:
+ """
+ Get a debugging (unconfigured) instance of an NLP parser.
+
+ Args:
+ classname: the name of the NLP parser class
+ raise_if_absent: raise ``ValueError`` if there is no match?
+
+ Returns:
+        an instance of the class, or ``None`` if there isn't one with that
+        name
+
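+    A usage sketch (``Crp`` is the class name of one of CRATE's built-in
+    parsers):
+
+    .. code-block:: python
+
+        crp_parser = make_nlp_parser_unconfigured("Crp")
+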
+ """
+ from crate_anon.nlp_manager.all_processors import (
+ get_nlp_parser_class,
+ ) # delayed import
+
+ cls = get_nlp_parser_class(classname)
+ if cls:
+ return cls(nlpdef=None, cfg_processor_name=None)
+ if raise_if_absent:
+ raise ValueError(f"Unknown NLP processor type: {classname!r}")
+ return None
diff --git a/crate_anon/nlp_manager/regex_parser.py b/crate_anon/nlp_manager/regex_parser.py
index aef4bdba..acc769cf 100644
--- a/crate_anon/nlp_manager/regex_parser.py
+++ b/crate_anon/nlp_manager/regex_parser.py
@@ -90,8 +90,8 @@
# "Present: Nicola Adams (NA). 1.0. Minutes of the last meeting."
# ... which we don't want to be interpreted as "sodium 1.0".
# HOW BEST TO DO THIS?
-# - https://stackoverflow.com/questions/546433/regular-expression-to-match-outer-brackets # noqa
-# https://stackoverflow.com/questions/7898310/using-regex-to-balance-match-parenthesis # noqa
+# - https://stackoverflow.com/questions/546433/regular-expression-to-match-outer-brackets # noqa: E501
+# https://stackoverflow.com/questions/7898310/using-regex-to-balance-match-parenthesis # noqa: E501
# - ... simplest is perhaps: base ignorables, or those with brackets, as above
# - ... even better than a nested thing is just a list of alternatives
@@ -958,7 +958,7 @@ def __init__(
\s* {OUT_OF_SEPARATOR} \s*
( {IGNORESIGN_INTEGER} ) # 5. group for denominator
)?
- """ # noqa
+ """ # noqa: E501
if debug:
log.debug(f"Regex for {self.classname()}: {regex_str}")
self.regex_str = regex_str
diff --git a/crate_anon/nlp_manager/tests/cloud_request_process_tests.py b/crate_anon/nlp_manager/tests/cloud_request_process_tests.py
index 679d1937..2e4d03d5 100644
--- a/crate_anon/nlp_manager/tests/cloud_request_process_tests.py
+++ b/crate_anon/nlp_manager/tests/cloud_request_process_tests.py
@@ -25,19 +25,58 @@
Unit tests.
+Reminder: to enable logging, use e.g. pytest -k [testname] --log-cli-level=INFO
+
"""
+import json
import logging
+import os
+from pathlib import Path
import sys
from unittest import mock, TestCase
+from tempfile import TemporaryDirectory
+from typing import Any, Dict
from cardinal_pythonlib.httpconst import HttpStatus
+from sqlalchemy.engine import create_engine
from sqlalchemy.exc import OperationalError
+from sqlalchemy.schema import Column
-from crate_anon.nlprp.constants import (
- NlprpKeys as NKeys,
+from crate_anon.nlp_manager.all_processors import (
+ register_all_crate_python_processors_with_serverprocessor,
+)
+from crate_anon.nlp_manager.cloud_parser import Cloud
+from crate_anon.nlp_manager.cloud_request import (
+ CloudRequest,
+ CloudRequestListProcessors,
+ CloudRequestProcess,
+)
+from crate_anon.nlp_manager.cloud_run_info import CloudRunInfo
+from crate_anon.nlp_manager.constants import (
+ CloudNlpConfigKeys,
+ DatabaseConfigKeys,
+ InputFieldConfigKeys,
+ NLP_CONFIG_ENV_VAR,
+ NlpConfigPrefixes,
+ NlpDefConfigKeys,
+ NlpDefValues,
+ NlpOutputConfigKeys,
+ ProcessorConfigKeys,
)
-from crate_anon.nlp_manager.cloud_request import CloudRequestProcess
+from crate_anon.nlp_manager.nlp_definition import NlpDefinition
+from crate_anon.nlp_manager.nlp_manager import drop_remake, process_cloud_now
+from crate_anon.nlp_webserver.server_processor import ServerProcessor
+import crate_anon.nlp_webserver.tasks
+from crate_anon.nlp_webserver.views import NlpWebViews
+from crate_anon.nlprp.constants import NlprpKeys, NlprpValues
+
+log = logging.getLogger(__name__)
+
+
+# =============================================================================
+# CloudRequestProcessTests
+# =============================================================================
class CloudRequestProcessTests(TestCase):
@@ -76,10 +115,10 @@ def test_process_all_inserts_values(self) -> None:
("output", {"fruit": "fig"}, self.mock_processor),
]
- mock_get_nlp_values_method = mock.Mock(return_value=iter(nlp_values))
+ mock_gen_nlp_values_method = mock.Mock(return_value=iter(nlp_values))
with mock.patch.multiple(
- self.process, get_nlp_values=mock_get_nlp_values_method
+ self.process, gen_nlp_values=mock_gen_nlp_values_method
):
self.process.process_all()
@@ -118,10 +157,10 @@ def test_process_all_handles_failed_insert(self) -> None:
"Insert failed", None, None, None
)
- mock_get_nlp_values_method = mock.Mock(return_value=iter(nlp_values))
+ mock_gen_nlp_values_method = mock.Mock(return_value=iter(nlp_values))
with self.assertLogs(level=logging.ERROR) as logging_cm:
with mock.patch.multiple(
- self.process, get_nlp_values=mock_get_nlp_values_method
+ self.process, gen_nlp_values=mock_gen_nlp_values_method
):
self.process.process_all()
@@ -163,8 +202,8 @@ def test_ready_for_status_ok(self) -> None:
self.process.queue_id = "queue_0001"
response = {
- NKeys.STATUS: HttpStatus.OK,
- NKeys.VERSION: "0.3.0",
+ NlprpKeys.STATUS: HttpStatus.OK,
+ NlprpKeys.VERSION: "0.3.0",
}
with mock.patch.object(
@@ -177,8 +216,8 @@ def test_not_ready_when_old_server_status_processing(self) -> None:
self.process.queue_id = "queue_0001"
response = {
- NKeys.STATUS: HttpStatus.PROCESSING,
- NKeys.VERSION: "0.2.0",
+ NlprpKeys.STATUS: HttpStatus.PROCESSING,
+ NlprpKeys.VERSION: "0.2.0",
}
with mock.patch.object(
@@ -191,8 +230,8 @@ def test_not_ready_when_new_server_status_accepted(self) -> None:
self.process.queue_id = "queue_0001"
response = {
- NKeys.STATUS: HttpStatus.ACCEPTED,
- NKeys.VERSION: "0.3.0",
+ NlprpKeys.STATUS: HttpStatus.ACCEPTED,
+ NlprpKeys.VERSION: "0.3.0",
}
with mock.patch.object(
@@ -205,8 +244,8 @@ def test_not_ready_when_server_status_not_found(self) -> None:
self.process.queue_id = "queue_0001"
response = {
- NKeys.STATUS: HttpStatus.NOT_FOUND,
- NKeys.VERSION: "0.3.0",
+ NlprpKeys.STATUS: HttpStatus.NOT_FOUND,
+ NlprpKeys.VERSION: "0.3.0",
}
with mock.patch.object(
@@ -221,8 +260,8 @@ def test_not_ready_when_server_status_anything_else(self) -> None:
self.process.queue_id = "queue_0001"
response = {
- NKeys.STATUS: HttpStatus.FORBIDDEN,
- NKeys.VERSION: "0.3.0",
+ NlprpKeys.STATUS: HttpStatus.FORBIDDEN,
+ NlprpKeys.VERSION: "0.3.0",
}
with mock.patch.object(
@@ -232,3 +271,398 @@ def test_not_ready_when_server_status_anything_else(self) -> None:
ready = self.process.check_if_ready()
self.assertFalse(ready)
self.assertIn("Got HTTP status code 403", logging_cm.output[0])
+
+
+# =============================================================================
+# CloudRequestListProcessorsTests
+# =============================================================================
+
+# A real processor-info example that wasn't working, 2024-12-16, with keys
+# parameterized and boolean values Pythonized.
+TEST_REMOTE_TABLE_SMOKING = "Smoking:Smoking"
+TEST_SMOKING_PROC_NAME = "smoking"
+TEST_SMOKING_PROC_VERSION = "0.1"
+TEST_SMOKING_PROCINFO = {
+ NlprpKeys.DESCRIPTION: "A description",
+ NlprpKeys.IS_DEFAULT_VERSION: True,
+ NlprpKeys.NAME: TEST_SMOKING_PROC_NAME,
+ NlprpKeys.SCHEMA_TYPE: NlprpValues.TABULAR,
+ NlprpKeys.SQL_DIALECT: "mssql",
+ NlprpKeys.TABULAR_SCHEMA: {
+ TEST_REMOTE_TABLE_SMOKING: [
+ {
+ NlprpKeys.COLUMN_NAME: "start_",
+ NlprpKeys.COLUMN_TYPE: "BIGINT",
+ NlprpKeys.DATA_TYPE: "BIGINT",
+ NlprpKeys.IS_NULLABLE: False,
+ },
+ {
+ NlprpKeys.COLUMN_NAME: "end_",
+ NlprpKeys.COLUMN_TYPE: "BIGINT",
+ NlprpKeys.DATA_TYPE: "BIGINT",
+ NlprpKeys.IS_NULLABLE: False,
+ },
+ {
+ NlprpKeys.COLUMN_NAME: "who",
+ NlprpKeys.COLUMN_TYPE: "NVARCHAR(255)",
+ NlprpKeys.DATA_TYPE: "NVARCHAR",
+ NlprpKeys.IS_NULLABLE: True,
+ },
+ {
+ NlprpKeys.COLUMN_NAME: "rule",
+ NlprpKeys.COLUMN_TYPE: "VARCHAR(50)",
+ NlprpKeys.DATA_TYPE: "VARCHAR",
+ NlprpKeys.IS_NULLABLE: True,
+ },
+ {
+ NlprpKeys.COLUMN_NAME: "status",
+ NlprpKeys.COLUMN_TYPE: "VARCHAR(10)",
+ NlprpKeys.DATA_TYPE: "VARCHAR",
+ NlprpKeys.IS_NULLABLE: True,
+ },
+ ]
+ },
+ NlprpKeys.TITLE: "Smoking Status Annotator",
+ NlprpKeys.VERSION: TEST_SMOKING_PROC_VERSION,
+}
+
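For orientation, here is a minimal hand-written sketch (not CRATE's actual code) of the SQLAlchemy table implied by the tabular_schema above; CRATE derives the equivalent automatically, as exercised by CloudRequestDataTests below.

```python
# Sketch only: the destination table implied by TEST_SMOKING_PROCINFO's
# tabular_schema, built by hand. CRATE builds this itself via
# Cloud.set_procinfo_if_correct() and Cloud.dest_tables_columns().
from sqlalchemy import BigInteger, Column, MetaData, String, Table, Unicode

metadata = MetaData()
smoking_table = Table(
    "smoking",  # local table name; the remote name is "Smoking:Smoking"
    metadata,
    Column("start_", BigInteger(), nullable=False),  # IS_NULLABLE: False
    Column("end_", BigInteger(), nullable=False),
    Column("who", Unicode(255), nullable=True),  # NVARCHAR(255)
    Column("rule", String(50), nullable=True),  # VARCHAR(50)
    Column("status", String(10), nullable=True),  # VARCHAR(10)
)
```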
+
+class CloudRequestListProcessorsTests(TestCase):
+ def setUp(self) -> None:
+ self.mock_nlpdef = mock.Mock(name="mock_nlpdef")
+ self.mock_nlpdef.name = "testlistprocdef"
+ self.process = CloudRequestListProcessors(nlpdef=self.mock_nlpdef)
+ self.test_version = "0.3.0"
+
+ def test_processors_key_missing(self) -> None:
+ response = {
+ NlprpKeys.STATUS: HttpStatus.ACCEPTED,
+ NlprpKeys.VERSION: self.test_version,
+ # Missing: NlprpKeys.PROCESSORS
+ }
+ with mock.patch.object(
+ self.process, "_post_get_json", return_value=response
+ ):
+ with self.assertRaises(KeyError):
+ self.process.get_remote_processors()
+
+ def test_processors_not_list(self) -> None:
+ response = {
+ NlprpKeys.STATUS: HttpStatus.ACCEPTED,
+ NlprpKeys.VERSION: self.test_version,
+ NlprpKeys.PROCESSORS: "XXX", # not a list
+ }
+ with mock.patch.object(
+ self.process, "_post_get_json", return_value=response
+ ):
+ with self.assertRaises(ValueError):
+ self.process.get_remote_processors()
+
+ def test_procinfo_not_dict(self) -> None:
+ procinfo = "xxx" # not a dict
+ response = {
+ NlprpKeys.STATUS: HttpStatus.ACCEPTED,
+ NlprpKeys.VERSION: self.test_version,
+ NlprpKeys.PROCESSORS: [procinfo],
+ }
+ with mock.patch.object(
+ self.process, "_post_get_json", return_value=response
+ ):
+ with self.assertRaises(ValueError):
+ self.process.get_remote_processors()
+
+ def test_procinfo_missing_keys(self) -> None:
+ mandatory_keys = (
+ NlprpKeys.NAME,
+ NlprpKeys.TITLE,
+ NlprpKeys.VERSION,
+ NlprpKeys.DESCRIPTION,
+ )
+ base_procinfo = {k: "x" for k in mandatory_keys}
+ for key in mandatory_keys:
+ procinfo = base_procinfo.copy()
+ del procinfo[key]
+ response = {
+ NlprpKeys.STATUS: HttpStatus.ACCEPTED,
+ NlprpKeys.VERSION: self.test_version,
+ NlprpKeys.PROCESSORS: [procinfo],
+ }
+ with mock.patch.object(
+ self.process, "_post_get_json", return_value=response
+ ):
+ with self.assertRaises(KeyError):
+ self.process.get_remote_processors()
+
+ def test_procinfo_smoking(self) -> None:
+ response = {
+ NlprpKeys.STATUS: HttpStatus.ACCEPTED,
+ NlprpKeys.VERSION: self.test_version,
+ NlprpKeys.PROCESSORS: [TEST_SMOKING_PROCINFO],
+ }
+ with mock.patch.object(
+ self.process, "_post_get_json", return_value=response
+ ):
+ self.process.get_remote_processors()
+ # Success: no exception raised.
+ # Clean up:
+ ServerProcessor.debug_remove_processor(
+ name=TEST_SMOKING_PROC_NAME, version=TEST_SMOKING_PROC_VERSION
+ )
+
+
+# =============================================================================
+# CloudRequestDataTests
+# =============================================================================
+
+
+class CloudRequestDataTests(TestCase):
+ def setUp(self) -> None:
+ # On-disk database
+ self.tempdir = TemporaryDirectory() # will be deleted automatically
+ self.dbfilepath = Path(self.tempdir.name, "test.sqlite")
+ log.info(f"Using temporary database: {self.dbfilepath}")
+ self.dburl = f"sqlite:///{self.dbfilepath.absolute()}"
+ self.txttable = "notes"
+ self.pkcol = "pk"
+ self.txtcol = "note"
+ self.echo = False
+ self._mk_test_data()
+
+ # Dummy database
+ self.dummy_engine = create_engine("sqlite://")
+
+ # Config file
+ self.nlpdefname = "mynlp"
+ self.dbsectionname = "mydb"
+ self.cloudconfigname = "mycloud"
+ self.inputname = "myinput"
+ self.cloudclassname = "Cloud"
+ self.cloudproc_crp = "proc_crp"
+ self.cloudproc_alcohol = "proc_alcohol"
+ self.output_crp = "crp_output"
+ self.output_alcohol = "alcohol_output"
+ self.configfilepath = Path(self.tempdir.name, "crate_test_nlp.ini")
+ with open(self.configfilepath, "wt") as f:
+ configtext = self._mk_nlp_config()
+ log.debug(configtext)
+ f.write(configtext)
+
+ # Server side
+ register_all_crate_python_processors_with_serverprocessor()
+ self.mock_pyramid_request = mock.Mock(name="mock_pyramid_request")
+ self.server = NlpWebViews(request=self.mock_pyramid_request)
+ self.server._authenticate = mock.Mock()
+ self.server._set_body_json_from_request = mock.Mock(
+ name="mock_set_body_json_from_request"
+ )
+
+ # Rather than modifying the instances, we modify the class, because
+ # process_now() creates its own instances.
+ # (CloudRequest is the base class of CloudRequestProcess and
+ # CloudRequestListProcessors.)
+ CloudRequest._post_get_json = self._get_server_response
+
+ # Client side #1
+ self.mock_nlpdef = mock.Mock()
+ self.mock_nlpdef.name = "testdef"
+ self.listprocclient = CloudRequestListProcessors(
+ nlpdef=self.mock_nlpdef
+ )
+
+ # Client side #2
+ os.environ[NLP_CONFIG_ENV_VAR] = str(self.configfilepath.absolute())
+ self.nlpdef = NlpDefinition(self.nlpdefname) # loads the config
+ self.crinfo = CloudRunInfo(nlpdef=self.nlpdef)
+
+ def _mk_nlp_config(self) -> str:
+ """
+ Returns the text of a test NLP config file.
+ """
+ return f"""
+# NLP definitions
+
+[{NlpConfigPrefixes.NLPDEF}:{self.nlpdefname}]
+{NlpDefConfigKeys.INPUTFIELDDEFS} =
+ {self.inputname}
+{NlpDefConfigKeys.PROCESSORS} =
+ {self.cloudclassname} {self.cloudproc_crp}
+ {self.cloudclassname} {self.cloudproc_alcohol}
+{NlpDefConfigKeys.PROGRESSDB} = {self.dbsectionname}
+{NlpDefConfigKeys.HASHPHRASE} = blah
+{NlpDefConfigKeys.CLOUD_CONFIG} = {self.cloudconfigname}
+{NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR} = {self.tempdir.name}
+
+# Inputs
+
+[{NlpConfigPrefixes.INPUT}:{self.inputname}]
+{InputFieldConfigKeys.SRCDB} = {self.dbsectionname}
+{InputFieldConfigKeys.SRCTABLE} = {self.txttable}
+{InputFieldConfigKeys.SRCPKFIELD} = {self.pkcol}
+{InputFieldConfigKeys.SRCFIELD} = {self.txtcol}
+
+# Processors
+
+# - CRP
+[{NlpConfigPrefixes.PROCESSOR}:{self.cloudproc_crp}]
+{ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_biochemistry.Crp
+{ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD}
+{ProcessorConfigKeys.OUTPUTTYPEMAP} =
+ crp {self.output_crp}
+{ProcessorConfigKeys.DESTDB} = {self.dbsectionname}
+
+# - Alcohol units
+[{NlpConfigPrefixes.PROCESSOR}:{self.cloudproc_alcohol}]
+{ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_substance_misuse.AlcoholUnits
+{ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD}
+{ProcessorConfigKeys.OUTPUTTYPEMAP} =
+ alcoholunits {self.output_alcohol}
+{ProcessorConfigKeys.DESTDB} = {self.dbsectionname}
+
+# Output sections
+
+# - CRP
+[{NlpConfigPrefixes.OUTPUT}:{self.output_crp}]
+{NlpOutputConfigKeys.DESTTABLE} = nlp_crp
+
+# - Alcohol units
+[{NlpConfigPrefixes.OUTPUT}:{self.output_alcohol}]
+{NlpOutputConfigKeys.DESTTABLE} = nlp_alcohol
+
+# Databases
+
+[{NlpConfigPrefixes.DATABASE}:{self.dbsectionname}]
+{DatabaseConfigKeys.URL} = {self.dburl}
+{DatabaseConfigKeys.ECHO} = {self.echo}
+
+# Cloud servers
+
+[{NlpConfigPrefixes.CLOUD}:{self.cloudconfigname}]
+{CloudNlpConfigKeys.CLOUD_URL} = https://dummy_url
+
+""" # noqa: E501
+
+ def _mk_test_data(self) -> None:
+ """
+ Creates a test table and inserts some test data into it.
+ """
+ texts = ["Current CRP 7. Teetotal. Non-smoker."]
+ engine = create_engine(self.dburl)
+ with engine.connect() as con:
+ con.execute(
+ f"""
+ CREATE TABLE {self.txttable} (
+ {self.pkcol} INTEGER,
+ {self.txtcol} TEXT
+ )
+ """
+ )
+ for i, text in enumerate(texts, start=1):
+ con.execute(
+ f"""
+ INSERT INTO {self.txttable}
+ ({self.pkcol}, {self.txtcol})
+ VALUES(:1, :2)
+ """,
+ i,
+ text,
+ )
+
+ # noinspection PyUnusedLocal
+ def _get_server_response(
+ self, request_json_str: str, may_fail: bool = False
+ ) -> Dict[str, Any]:
+ """
+ Takes a JSON request that has come from our mock client (in string
+ form) and returns a JSON response from our mock server (in
+ dictionary form).
+ """
+ request_json = json.loads(request_json_str)
+ log.debug(f"{request_json=}")
+ self.server.body = request_json
+ response_json = self.server.index()
+ log.debug(f"-> {response_json=}")
+ return response_json
+
+ def test_get_remote_processor_columns(self) -> None:
+ """
+ Check that a client can request processor definitions from the server
+ (testing both ends, just simplifying some communication between them,
+ e.g. removing authentication). Check that the client can synthesise
+ SQLAlchemy Column objects from the results.
+ """
+ # Fetch the data
+ processors = self.listprocclient.get_remote_processors()
+ # Check it
+ self.assertIsInstance(processors, list)
+ for sp in processors:
+ self.assertIsInstance(sp, ServerProcessor)
+ log.debug(f"+++ Trying {sp.name=}")
+
+ # We won't go so far as to set up a mock database in full. But
+ # check that Column objects are created.
+ # (a) setup
+ c = Cloud(nlpdef=None, cfg_processor_name=None)
+ c.procname = sp.name
+ c._destdb = mock.Mock(name="mock_destdb")
+ c._destdb.engine = self.dummy_engine
+ c.set_procinfo_if_correct(sp)
+ log.debug(f"--- {c.schema=}")
+ for tablename in c.schema.keys():
+ ouc = mock.Mock(name=f"mock_ouc_{tablename}")
+ ouc.get_columns = lambda _engine: []
+ ouc.renames = {}
+ ouc.dest_tablename = tablename # usually a property
+ ouc.destfields = []
+ # ... simulating OutputUserConfig
+ c._outputtypemap[tablename] = ouc
+ c._type_to_tablename[tablename] = tablename
+ # (b) test
+ self.assertTrue(c.is_tabular())
+ table_columns = c.dest_tables_columns()
+ self.assertTrue(len(table_columns) > 0)
+ for tablename, columns in table_columns.items():
+ log.debug(f"--- {sp.name=}: {tablename=}; {columns=}")
+ self.assertTrue(len(columns) > 0)
+ for col in columns:
+ self.assertIsInstance(col, Column)
+
+ def test_cloud_pipeline_dict_format(self) -> None:
+ """
+ Test the full pipeline:
+
+ - create a source database (in setUp);
+ - build a config file (in setUp);
+ - create destination tables, based on remote processor definitions
+ using tabular_schema;
+ - run data through cloud NLP, using the NLPRP dict (JSON object)
+ result format, and insert results.
+
+ """
+ prev = crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT
+ crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT = True
+ drop_remake(nlpdef=self.nlpdef)
+ process_cloud_now(crinfo=self.crinfo)
+ # The test is (currently) that it doesn't crash.
+ # Reset for other tests:
+ crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT = prev
+
+ # To explore the database manually:
+ # import pdb; pdb.set_trace()
+
+ def test_cloud_pipeline_list_format(self) -> None:
+ """
+ Test the full pipeline:
+
+ - create a source database (in setUp);
+ - build a config file (in setUp);
+ - create destination tables, based on remote processor definitions
+ using tabular_schema;
+ - run data through cloud NLP, using the NLPRP list (JSON array)
+ result format, and insert results.
+
+ """
+ prev = crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT
+ crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT = False
+ drop_remake(nlpdef=self.nlpdef)
+ process_cloud_now(crinfo=self.crinfo)
+ # Reset for other tests:
+ crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT = prev
diff --git a/crate_anon/nlp_manager/tests/cloud_request_sender_tests.py b/crate_anon/nlp_manager/tests/cloud_request_sender_tests.py
index 8142ec4b..8a6c0e91 100644
--- a/crate_anon/nlp_manager/tests/cloud_request_sender_tests.py
+++ b/crate_anon/nlp_manager/tests/cloud_request_sender_tests.py
@@ -32,17 +32,17 @@
from crate_anon.nlp_manager.cloud_request import CloudRequestProcess
from crate_anon.nlp_manager.cloud_request_sender import CloudRequestSender
-from crate_anon.nlp_manager.constants import HashClass
-from crate_anon.nlp_manager.input_field_config import (
+from crate_anon.nlp_manager.constants import (
FN_SRCDB,
FN_SRCFIELD,
FN_SRCPKFIELD,
FN_SRCPKSTR,
FN_SRCPKVAL,
FN_SRCTABLE,
+ HashClass,
)
from crate_anon.nlp_manager.models import FN_SRCHASH, NlpRecord
-from crate_anon.nlprp.constants import NlprpKeys as NKeys
+from crate_anon.nlprp.constants import NlprpKeys
PANAMOWA = "A woman, a plan, a canal. Panamowa!"
PAGODA = "A dog! A panic in a pagoda."
@@ -163,14 +163,17 @@ def test_single_text_sent_in_single_request(self) -> None:
include_text_in_reply=True, # has_gate_processors from config
)
- records = cloud_requests[0]._request_process[NKeys.ARGS][NKeys.CONTENT]
+ records = cloud_requests[0]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
- self.assertEqual(records[0][NKeys.METADATA][FN_SRCPKVAL], 1)
- self.assertEqual(records[0][NKeys.METADATA][FN_SRCPKSTR], "pkstr")
+ self.assertEqual(records[0][NlprpKeys.METADATA][FN_SRCPKVAL], 1)
+ self.assertEqual(records[0][NlprpKeys.METADATA][FN_SRCPKSTR], "pkstr")
self.assertEqual(
- records[0][NKeys.METADATA][FN_SRCHASH], self.hasher.hash(PANAMOWA)
+ records[0][NlprpKeys.METADATA][FN_SRCHASH],
+ self.hasher.hash(PANAMOWA),
)
- self.assertEqual(records[0][NKeys.TEXT], PANAMOWA)
+ self.assertEqual(records[0][NlprpKeys.TEXT], PANAMOWA)
def test_multiple_records_sent_in_single_request(self) -> None:
self.test_text = [
@@ -222,11 +225,13 @@ def test_multiple_records_sent_in_single_request(self) -> None:
include_text_in_reply=True, # has_gate_processors
)
- records = cloud_requests[0]._request_process[NKeys.ARGS][NKeys.CONTENT]
+ records = cloud_requests[0]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
- self.assertEqual(records[0][NKeys.METADATA][FN_SRCPKVAL], 1)
- self.assertEqual(records[1][NKeys.METADATA][FN_SRCPKSTR], "pkstr")
- self.assertEqual(records[2][NKeys.TEXT], REVOLT)
+ self.assertEqual(records[0][NlprpKeys.METADATA][FN_SRCPKVAL], 1)
+ self.assertEqual(records[1][NlprpKeys.METADATA][FN_SRCPKSTR], "pkstr")
+ self.assertEqual(records[2][NlprpKeys.TEXT], REVOLT)
def test_max_records_per_request(self) -> None:
self.test_text = [
@@ -318,14 +323,20 @@ def mock_send_0_side_effect(*args, **kwargs):
include_text_in_reply=True, # has_gate_processors from config
)
- content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT]
- self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA)
+ content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
+ self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA)
- content_1 = requests_out[1]._request_process[NKeys.ARGS][NKeys.CONTENT]
- self.assertEqual(content_1[0][NKeys.TEXT], PAGODA)
+ content_1 = requests_out[1]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
+ self.assertEqual(content_1[0][NlprpKeys.TEXT], PAGODA)
- content_2 = requests_out[2]._request_process[NKeys.ARGS][NKeys.CONTENT]
- self.assertEqual(content_2[0][NKeys.TEXT], REVOLT)
+ content_2 = requests_out[2]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
+ self.assertEqual(content_2[0][NlprpKeys.TEXT], REVOLT)
logger_name = "crate_anon.nlp_manager.cloud_request_sender"
expected_message = "Sent request to be processed: #1 of this block"
@@ -397,11 +408,13 @@ def test_limit_before_commit_2(self) -> None:
include_text_in_reply=True, # has_gate_processors from config
)
- content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT]
+ content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
self.assertEqual(len(content_0), 2)
- self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA)
+ self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA)
- self.assertEqual(content_0[1][NKeys.TEXT], PAGODA)
+ self.assertEqual(content_0[1][NlprpKeys.TEXT], PAGODA)
def test_max_content_length(self) -> None:
self.test_text = [
@@ -473,13 +486,17 @@ def test_max_content_length(self) -> None:
include_text_in_reply=True, # has_gate_processors from config
)
- content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT]
- self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA)
+ content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
+ self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA)
- self.assertEqual(content_0[1][NKeys.TEXT], PAGODA)
+ self.assertEqual(content_0[1][NlprpKeys.TEXT], PAGODA)
- content_1 = requests_out[1]._request_process[NKeys.ARGS][NKeys.CONTENT]
- self.assertEqual(content_1[0][NKeys.TEXT], REVOLT)
+ content_1 = requests_out[1]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
+ self.assertEqual(content_1[0][NlprpKeys.TEXT], REVOLT)
def test_record_bigger_than_max_content_length_skipped(self) -> None:
short_text = "Some text with serialized length greater than 500. "
@@ -553,12 +570,16 @@ def test_record_bigger_than_max_content_length_skipped(self) -> None:
include_text_in_reply=True, # has_gate_processors from config
)
- content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT]
+ content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
self.assertEqual(len(content_0), 1)
- self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA)
+ self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA)
- content_1 = requests_out[1]._request_process[NKeys.ARGS][NKeys.CONTENT]
- self.assertEqual(content_1[0][NKeys.TEXT], REVOLT)
+ content_1 = requests_out[1]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
+ self.assertEqual(content_1[0][NlprpKeys.TEXT], REVOLT)
logger_name = "crate_anon.nlp_manager.cloud_request_sender"
self.assertIn(
@@ -643,8 +664,10 @@ def get_progress_record(pkval: int, pkstr: str) -> Optional[NlpRecord]:
include_text_in_reply=True, # has_gate_processors from config
)
- content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT]
- self.assertEqual(content_0[0][NKeys.TEXT], PAGODA)
+ content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
+ self.assertEqual(content_0[0][NlprpKeys.TEXT], PAGODA)
logger_name = "crate_anon.nlp_manager.cloud_request_sender"
self.assertIn(
@@ -821,8 +844,10 @@ def test_record_with_no_text_skipped(self) -> None:
include_text_in_reply=True, # has_gate_processors from config
)
- content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT]
+ content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][
+ NlprpKeys.CONTENT
+ ]
self.assertEqual(len(content_0), 2)
- self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA)
+ self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA)
- self.assertEqual(content_0[1][NKeys.TEXT], REVOLT)
+ self.assertEqual(content_0[1][NlprpKeys.TEXT], REVOLT)
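As background for the assertions above (and the views.py changes later in this diff): the overall shape of an NLPRP ``process`` request. The top-level key names follow the NLPRP specification; the metadata keys and values here are invented stand-ins for the FN_SRC* fields the tests assert on.

```python
# Illustrative NLPRP "process" request; metadata keys/values are invented.
nlprp_request = {
    "command": "process",
    "args": {
        "processors": [{"name": "crp", "version": "0.1"}],
        "queue": False,
        "client_job_id": "",
        "include_text": True,
        "content": [
            {
                "metadata": {"_srcpkval": 1, "_srcpkstr": "pkstr"},
                "text": "A woman, a plan, a canal. Panamowa!",
            }
        ],
    },
}
# What the tests above reach via
# _request_process[NlprpKeys.ARGS][NlprpKeys.CONTENT]:
records = nlprp_request["args"]["content"]
assert records[0]["text"] == "A woman, a plan, a canal. Panamowa!"
```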
diff --git a/crate_anon/nlp_manager/tests/nlp_parser_tests.py b/crate_anon/nlp_manager/tests/nlp_parser_tests.py
index fe04a843..e454bdc1 100644
--- a/crate_anon/nlp_manager/tests/nlp_parser_tests.py
+++ b/crate_anon/nlp_manager/tests/nlp_parser_tests.py
@@ -37,6 +37,8 @@
class FruitParser(BaseNlpParser):
+ is_test_nlp_parser = True
+
def test(self, verbose: bool = False) -> None:
pass
diff --git a/crate_anon/nlp_webserver/procs.py b/crate_anon/nlp_webserver/procs.py
index 77cb837d..3a223ca9 100644
--- a/crate_anon/nlp_webserver/procs.py
+++ b/crate_anon/nlp_webserver/procs.py
@@ -33,10 +33,7 @@
from crate_anon.common.constants import EnvVar
from crate_anon.nlprp.constants import NlprpKeys
-from crate_anon.nlp_webserver.constants import (
- KEY_PROCTYPE,
- NlpServerConfigKeys,
-)
+from crate_anon.nlp_webserver.constants import NlpServerConfigKeys
from crate_anon.nlp_webserver.server_processor import ServerProcessor
from crate_anon.nlp_webserver.settings import SETTINGS
@@ -57,16 +54,7 @@
_name = _proc[NlprpKeys.NAME]
_version = _proc[NlprpKeys.VERSION]
log.info(f"Registering NLPRP processor {_name!r} (v{_version})")
- _x = ServerProcessor(
- name=_name,
- title=_proc[NlprpKeys.TITLE],
- version=_version,
- is_default_version=_proc[NlprpKeys.IS_DEFAULT_VERSION],
- description=_proc[NlprpKeys.DESCRIPTION],
- proctype=_proc.get(KEY_PROCTYPE), # may be None
- schema_type=_proc[NlprpKeys.SCHEMA_TYPE], # 'unknown' or 'tabular'
- sql_dialect=_proc.get(NlprpKeys.SQL_DIALECT),
- tabular_schema=_proc.get(NlprpKeys.TABULAR_SCHEMA),
- ) # registers with the ServerProcessor class
- # Doing this here saves time per request
+ _x = ServerProcessor.from_nlprp_json_dict(_proc)
+ # ... registers with the ServerProcessor class
+ # Doing this here saves time per request:
_x.set_parser()
diff --git a/crate_anon/nlp_webserver/server_processor.py b/crate_anon/nlp_webserver/server_processor.py
index c2e92af4..541f0f4b 100644
--- a/crate_anon/nlp_webserver/server_processor.py
+++ b/crate_anon/nlp_webserver/server_processor.py
@@ -30,11 +30,17 @@
from typing import Dict, Optional, Any
from crate_anon.nlp_manager.base_nlp_parser import BaseNlpParser
-from crate_anon.nlp_manager.all_processors import make_nlp_parser_unconfigured
+from crate_anon.nlp_manager.processor_helpers import (
+ make_nlp_parser_unconfigured,
+)
from crate_anon.nlprp.api import JsonObjectType, NlprpServerProcessor
from crate_anon.nlprp.constants import NlprpKeys, NlprpValues
from crate_anon.nlprp.errors import BAD_REQUEST, mkerror, no_such_proc_error
-from crate_anon.nlp_webserver.constants import PROCTYPE_GATE, GATE_BASE_URL
+from crate_anon.nlp_webserver.constants import (
+ KEY_PROCTYPE,
+ PROCTYPE_GATE,
+ GATE_BASE_URL,
+)
class ServerProcessor(NlprpServerProcessor):
@@ -93,9 +99,38 @@ def __init__(
# Add instance to list of processors
ServerProcessor.processors[self.processor_id] = self
+ @classmethod
+ def from_nlprp_json_dict(
+ cls, processor_dict: Dict[str, Any]
+ ) -> "ServerProcessor":
+ pd = processor_dict # shorthand
+ return ServerProcessor(
+ name=pd[NlprpKeys.NAME],
+ title=pd[NlprpKeys.TITLE],
+ version=pd[NlprpKeys.VERSION],
+ is_default_version=pd[NlprpKeys.IS_DEFAULT_VERSION],
+ description=pd[NlprpKeys.DESCRIPTION],
+ proctype=pd.get(KEY_PROCTYPE), # may be None
+ schema_type=pd[NlprpKeys.SCHEMA_TYPE], # 'unknown' or 'tabular'
+ sql_dialect=pd.get(NlprpKeys.SQL_DIALECT),
+ tabular_schema=pd.get(NlprpKeys.TABULAR_SCHEMA),
+ ) # also registers with the ServerProcessor class
+
+ @classmethod
+ def debug_remove_processor(cls, name: str, version: str) -> None:
+ """
+ For debugging purposes (testing). De-registers a processor.
+ """
+ processor_id = cls._mk_processor_id(name, version)
+ cls.processors.pop(processor_id, None) # delete if present
+
+ @classmethod
+ def _mk_processor_id(cls, name: str, version: str) -> str:
+ return f"{name}_{version}"
+
@property
def processor_id(self) -> str:
- return f"{self.name}_{self.version}"
+ return self._mk_processor_id(self.name, self.version)
@classmethod
def get_processor(cls, name: str, version: str = "") -> "ServerProcessor":
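A hedged usage sketch of the new classmethods. It assumes that NlprpKeys members carry the NLPRP spec's string values and that NlprpValues has an UNKNOWN member (suggested by the "'unknown' or 'tabular'" comment above); the processor name and values are invented.

```python
from crate_anon.nlprp.constants import NlprpKeys, NlprpValues
from crate_anon.nlp_webserver.server_processor import ServerProcessor

proc_dict = {
    NlprpKeys.NAME: "toy",
    NlprpKeys.TITLE: "Toy processor",
    NlprpKeys.VERSION: "0.1",
    NlprpKeys.IS_DEFAULT_VERSION: True,
    NlprpKeys.DESCRIPTION: "Example only",
    NlprpKeys.SCHEMA_TYPE: NlprpValues.UNKNOWN,  # assumed constant
}
sp = ServerProcessor.from_nlprp_json_dict(proc_dict)  # also registers it
assert ServerProcessor.processors[sp.processor_id] is sp  # id: "toy_0.1"
ServerProcessor.debug_remove_processor(name="toy", version="0.1")
```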
diff --git a/crate_anon/nlp_webserver/tasks.py b/crate_anon/nlp_webserver/tasks.py
index b53e2cd5..62a91d68 100644
--- a/crate_anon/nlp_webserver/tasks.py
+++ b/crate_anon/nlp_webserver/tasks.py
@@ -27,6 +27,7 @@
"""
+from collections import defaultdict
import datetime
import json
import logging
@@ -66,6 +67,18 @@
log = logging.getLogger(__name__)
+# =============================================================================
+# Constants
+# =============================================================================
+
+# Use the more explicit NLPRP dictionary (JSON object) result format, rather
+# than the simpler list (JSON array) format?
+USE_DICT_FORMAT_NLPRP_RESULT = True
+
+# Following on from that:
+EMPTY_PROCESSOR_RESULT = {} if USE_DICT_FORMAT_NLPRP_RESULT else []
+
+
# =============================================================================
# SQLAlchemy and Celery setup
# =============================================================================
@@ -122,7 +135,7 @@ def nlprp_processor_dict(
processor:
a :class:`crate_anon.nlp_webserver.procs.Processor`, or ``None``
results:
- a JSON array of results
+ a JSON array (or dict) of results
errcode:
(if not ``success``) an integer error code
errmsg:
@@ -131,7 +144,10 @@ def nlprp_processor_dict(
Returns:
a JSON object in NLPRP format
"""
- proc_dict = {NlprpKeys.RESULTS: results or [], NlprpKeys.SUCCESS: success}
+ proc_dict = {
+ NlprpKeys.RESULTS: results or EMPTY_PROCESSOR_RESULT,
+ NlprpKeys.SUCCESS: success,
+ }
if processor:
proc_dict[NlprpKeys.NAME] = processor.name
proc_dict[NlprpKeys.TITLE] = processor.title
@@ -394,6 +410,12 @@ def process_nlp_internal(
"""
Send text to a chosen CRATE Python NLP processor and return a
:class:`NlpServerResult`.
+
+ Args:
+ text:
+ The text to process.
+ processor:
+ The NLP processor to use.
"""
parser = processor.parser
try:
@@ -402,8 +424,26 @@ def process_nlp_internal(
return internal_error(
f"parser is not a CRATE Python NLP parser; is {parser!r}"
)
- # Get second element of each element in parsed text as first is tablename
- # which will have no meaning here
- return nlprp_processor_dict(
- True, processor, results=[x[1] for x in tablename_valuedict_generator]
- )
+ if USE_DICT_FORMAT_NLPRP_RESULT:
+ results = defaultdict(list)
+ for tablename, tableresult in tablename_valuedict_generator:
+ results[tablename].append(tableresult)
+ # If we use defaultdict(list) here, unmodified, the default JSON
+ # serialiser messes up and produces things like {'results':
+ # defaultdict(<class 'list'>, {'alcoholunits': [{'variable_name': ...
+ # Building via defaultdict above is probably quickest, but we convert
+ # to a plain dict here:
+ results = dict(results)
+ else:
+ # Get second element of each element in parsed text as first is
+ # tablename which will have no meaning here
+ results = []
+ tables_seen = set()
+ for tablename, tableresult in tablename_valuedict_generator:
+ results.append(tableresult)
+ tables_seen.add(tablename)
+ assert len(tables_seen) < 2, (
+ "Internal bug: can't return results from multiple tables (within "
+ "a single NLP processor) in single-table list format"
+ )
+ return nlprp_processor_dict(True, processor, results=results)
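To make the two branches above concrete, here is a self-contained sketch (field names invented) of the same parser output serialised in each NLPRP result format:

```python
import json
from collections import defaultdict

rows = [  # (tablename, valuedict) pairs, as yielded by the parser
    ("crp", {"variable_name": "CRP", "value_text": "7"}),
    ("crp", {"variable_name": "CRP", "value_text": "11"}),
]

# Dict (JSON object) format: results grouped by destination table, so one
# processor can safely feed several tables.
dict_results = defaultdict(list)
for tablename, valuedict in rows:
    dict_results[tablename].append(valuedict)
dict_results = dict(dict_results)  # plain dict, so json.dumps() is clean

# List (JSON array) format: flat, table names discarded; only valid when a
# processor writes to a single table.
list_results = [valuedict for _tablename, valuedict in rows]

print(json.dumps(dict_results))  # {"crp": [{...}, {...}]}
print(json.dumps(list_results))  # [{...}, {...}]
```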
diff --git a/crate_anon/nlp_webserver/views.py b/crate_anon/nlp_webserver/views.py
index 106cd474..d7e6554e 100644
--- a/crate_anon/nlp_webserver/views.py
+++ b/crate_anon/nlp_webserver/views.py
@@ -67,7 +67,7 @@
)
from crate_anon.nlprp.constants import (
NlprpCommands,
- NlprpKeys as NKeys,
+ NlprpKeys,
NlprpValues,
)
from crate_anon.nlprp.errors import (
@@ -190,7 +190,7 @@ def __init__(self, nlprp_request: JsonObjectType) -> None:
# The processors being requested. We fetch all of them now, so they
# can be iterated through fast for each document.
requested_processors = json_get_array(
- args, NKeys.PROCESSORS, required=True
+ args, NlprpKeys.PROCESSORS, required=True
)
self.processors = [
ServerProcessor.get_processor_nlprp(d)
@@ -198,18 +198,18 @@ def __init__(self, nlprp_request: JsonObjectType) -> None:
]
# Queue?
- self.queue = json_get_bool(args, NKeys.QUEUE, default=False)
+ self.queue = json_get_bool(args, NlprpKeys.QUEUE, default=False)
# Client job ID
self.client_job_id = json_get_str(
- args, NKeys.CLIENT_JOB_ID, default=""
+ args, NlprpKeys.CLIENT_JOB_ID, default=""
)
# Include the source text in the reply?
- self.include_text = json_get_bool(args, NKeys.INCLUDE_TEXT)
+ self.include_text = json_get_bool(args, NlprpKeys.INCLUDE_TEXT)
# Content: list of objects (each with text and metadata)
- self.content = json_get_array(args, NKeys.CONTENT, required=True)
+ self.content = json_get_array(args, NlprpKeys.CONTENT, required=True)
def processor_ids(self) -> List[str]:
"""
@@ -236,9 +236,9 @@ def gen_text_metadataobj(
tuple: ``(text, metadata)``, as above
"""
for document in self.content:
- text = json_get_str(document, NKeys.TEXT, required=True)
+ text = json_get_str(document, NlprpKeys.TEXT, required=True)
metadata = json_get_value(
- document, NKeys.METADATA, default=None, required=False
+ document, NlprpKeys.METADATA, default=None, required=False
)
yield text, metadata
@@ -252,16 +252,16 @@ def gen_text_metadatastr(self) -> Generator[Tuple[str, str], None, None]:
"""
try:
for document in self.content:
- text = json_get_str(document, NKeys.TEXT, required=True)
+ text = json_get_str(document, NlprpKeys.TEXT, required=True)
metadata = json_get_value(
- document, NKeys.METADATA, default=None, required=False
+ document, NlprpKeys.METADATA, default=None, required=False
)
metadata_str = json.dumps(
metadata, separators=JSON_SEPARATORS_COMPACT
)
yield text, metadata_str
except KeyError:
- raise key_missing_error(key=NKeys.TEXT)
+ raise key_missing_error(key=NlprpKeys.TEXT)
# =============================================================================
@@ -321,14 +321,14 @@ def create_response(
# Put status in HTTP header
self.set_http_response_status(status)
response_dict = {
- NKeys.STATUS: status,
- NKeys.PROTOCOL: {
- NKeys.NAME: NlprpValues.NLPRP_PROTOCOL_NAME,
- NKeys.VERSION: NLPRP_VERSION_STRING,
+ NlprpKeys.STATUS: status,
+ NlprpKeys.PROTOCOL: {
+ NlprpKeys.NAME: NlprpValues.NLPRP_PROTOCOL_NAME,
+ NlprpKeys.VERSION: NLPRP_VERSION_STRING,
},
- NKeys.SERVER_INFO: {
- NKeys.NAME: SERVER_NAME,
- NKeys.VERSION: SERVER_VERSION,
+ NlprpKeys.SERVER_INFO: {
+ NlprpKeys.NAME: SERVER_NAME,
+ NlprpKeys.VERSION: SERVER_VERSION,
},
}
if extra_info is not None:
@@ -344,11 +344,11 @@ def create_error_response(self, error: NlprpError) -> JsonObjectType:
# Turned 'errors' into array
# Should this allow for multiple errors?
error_info = {
- NKeys.ERRORS: [
+ NlprpKeys.ERRORS: [
{
- NKeys.CODE: error.code,
- NKeys.MESSAGE: error.message,
- NKeys.DESCRIPTION: error.description,
+ NlprpKeys.CODE: error.code,
+ NlprpKeys.MESSAGE: error.message,
+ NlprpKeys.DESCRIPTION: error.description,
}
]
}
@@ -450,7 +450,7 @@ def handle_nlprp_request(self) -> JsonObjectType:
"""
self._authenticate()
self._set_body_json_from_request()
- command = json_get_str(self.body, NKeys.COMMAND, required=True)
+ command = json_get_str(self.body, NlprpKeys.COMMAND, required=True)
log.debug(
f"NLPRP request received from {self.request.remote_addr}: "
f"username={self.username}, command={command}"
@@ -489,7 +489,7 @@ def list_processors(self) -> JsonObjectType:
return self.create_response(
status=HttpStatus.OK,
extra_info={
- NKeys.PROCESSORS: [
+ NlprpKeys.PROCESSORS: [
proc.infodict
for proc in ServerProcessor.processors.values()
]
@@ -518,23 +518,23 @@ def process_now(
password=self.password,
)
# proc_dict = procresult.nlprp_processor_dict(processor)
- if procresult[NKeys.NAME] is None:
- procresult[NKeys.NAME] = processor.name
- procresult[NKeys.TITLE] = processor.title
- procresult[NKeys.VERSION] = processor.version
+ if procresult[NlprpKeys.NAME] is None:
+ procresult[NlprpKeys.NAME] = processor.name
+ procresult[NlprpKeys.TITLE] = processor.title
+ procresult[NlprpKeys.VERSION] = processor.version
processor_data.append(procresult)
doc_result = {
- NKeys.METADATA: metadata,
- NKeys.PROCESSORS: processor_data,
+ NlprpKeys.METADATA: metadata,
+ NlprpKeys.PROCESSORS: processor_data,
}
if process_request.include_text:
- doc_result[NKeys.TEXT] = text
+ doc_result[NlprpKeys.TEXT] = text
results.append(doc_result)
response_info = {
- NKeys.CLIENT_JOB_ID: process_request.client_job_id,
- NKeys.RESULTS: results,
+ NlprpKeys.CLIENT_JOB_ID: process_request.client_job_id,
+ NlprpKeys.RESULTS: results,
}
return self.create_response(
status=HttpStatus.OK, extra_info=response_info
@@ -601,7 +601,7 @@ def put_in_queue(
task_id=dpr_id, # for Celery
)
- response_info = {NKeys.QUEUE_ID: queue_id}
+ response_info = {NlprpKeys.QUEUE_ID: queue_id}
return self.create_response(
status=HttpStatus.ACCEPTED, extra_info=response_info
)
@@ -615,7 +615,7 @@ def fetch_from_queue(self) -> JsonObjectType:
# Args
# ---------------------------------------------------------------------
args = json_get_toplevel_args(self.body)
- queue_id = json_get_str(args, NKeys.QUEUE_ID, required=True)
+ queue_id = json_get_str(args, NlprpKeys.QUEUE_ID, required=True)
# ---------------------------------------------------------------------
# Start with the DocProcRequests, because if some are still busy,
@@ -642,8 +642,8 @@ def fetch_from_queue(self) -> JsonObjectType:
return self.create_response(
HttpStatus.ACCEPTED,
{
- NKeys.N_DOCPROCS: n,
- NKeys.N_DOCPROCS_COMPLETED: n_done,
+ NlprpKeys.N_DOCPROCS: n,
+ NlprpKeys.N_DOCPROCS_COMPLETED: n_done,
},
)
@@ -682,19 +682,19 @@ def get_processor_cached(_processor_id: str) -> ServerProcessor:
# ... data for *all* the processors for this doc
for dpr in doc.docprocrequests:
procresult = json.loads(dpr.results) # type: Dict[str, Any]
- if procresult[NKeys.NAME] is None:
+ if procresult[NlprpKeys.NAME] is None:
processor = get_processor_cached(dpr.processor_id)
- procresult[NKeys.NAME] = processor.name
- procresult[NKeys.TITLE] = processor.title
- procresult[NKeys.VERSION] = processor.version
+ procresult[NlprpKeys.NAME] = processor.name
+ procresult[NlprpKeys.TITLE] = processor.title
+ procresult[NlprpKeys.VERSION] = processor.version
processor_data.append(procresult)
metadata = json.loads(doc.client_metadata)
doc_result = {
- NKeys.METADATA: metadata,
- NKeys.PROCESSORS: processor_data,
+ NlprpKeys.METADATA: metadata,
+ NlprpKeys.PROCESSORS: processor_data,
}
if doc.include_text:
- doc_result[NKeys.TEXT] = doc.doctext
+ doc_result[NlprpKeys.TEXT] = doc.doctext
doc_results.append(doc_result)
# ---------------------------------------------------------------------
@@ -706,10 +706,10 @@ def get_processor_cached(_processor_id: str) -> ServerProcessor:
# ... will also delete the DocProcRequests via a cascade
response_info = {
- NKeys.CLIENT_JOB_ID: (
+ NlprpKeys.CLIENT_JOB_ID: (
client_job_id if client_job_id is not None else ""
),
- NKeys.RESULTS: doc_results,
+ NlprpKeys.RESULTS: doc_results,
}
return self.create_response(
status=HttpStatus.OK, extra_info=response_info
@@ -724,7 +724,7 @@ def show_queue(self) -> JsonObjectType:
args = json_get_toplevel_args(self.body, required=False)
if args:
client_job_id = json_get_str(
- args, NKeys.CLIENT_JOB_ID, default="", required=False
+ args, NlprpKeys.CLIENT_JOB_ID, default="", required=False
)
else:
client_job_id = ""
@@ -764,15 +764,15 @@ def show_queue(self) -> JsonObjectType:
queue_answer.append(
{
- NKeys.QUEUE_ID: queue_id,
- NKeys.CLIENT_JOB_ID: client_job_id,
- NKeys.STATUS: (
+ NlprpKeys.QUEUE_ID: queue_id,
+ NlprpKeys.CLIENT_JOB_ID: client_job_id,
+ NlprpKeys.STATUS: (
NlprpValues.BUSY if busy else NlprpValues.READY
),
- NKeys.DATETIME_SUBMITTED: pendulum_to_nlprp_datetime(
+ NlprpKeys.DATETIME_SUBMITTED: pendulum_to_nlprp_datetime(
dt_submitted, to_utc=True
),
- NKeys.DATETIME_COMPLETED: (
+ NlprpKeys.DATETIME_COMPLETED: (
None
if busy
else pendulum_to_nlprp_datetime(max_time, to_utc=True)
@@ -780,7 +780,7 @@ def show_queue(self) -> JsonObjectType:
}
)
return self.create_response(
- status=HttpStatus.OK, extra_info={NKeys.QUEUE: queue_answer}
+ status=HttpStatus.OK, extra_info={NlprpKeys.QUEUE: queue_answer}
)
def delete_from_queue(self) -> JsonObjectType:
@@ -788,8 +788,8 @@ def delete_from_queue(self) -> JsonObjectType:
Deletes from the queue all entries specified by the client.
"""
args = json_get_toplevel_args(self.body)
- delete_all = json_get_bool(args, NKeys.DELETE_ALL, default=False)
- client_job_ids = json_get_array_of_str(args, NKeys.CLIENT_JOB_IDS)
+ delete_all = json_get_bool(args, NlprpKeys.DELETE_ALL, default=False)
+ client_job_ids = json_get_array_of_str(args, NlprpKeys.CLIENT_JOB_IDS)
# Establish what to cancel/delete
q_dpr = (
diff --git a/crate_anon/nlprp/api.py b/crate_anon/nlprp/api.py
index 199ace8c..e3ad3394 100644
--- a/crate_anon/nlprp/api.py
+++ b/crate_anon/nlprp/api.py
@@ -840,3 +840,10 @@ def __str__(self) -> str:
def __repr__(self) -> str:
return auto_repr(self)
+
+ def is_tabular(self) -> bool:
+ """
+ Is the format of the schema information given by the remote processor
+ tabular?
+ """
+ return self.schema_type == NlprpValues.TABULAR
diff --git a/crate_anon/nlprp/nlprp_test_client.py b/crate_anon/nlprp/nlprp_test_client.py
index 3b6aeaa1..bc549569 100755
--- a/crate_anon/nlprp/nlprp_test_client.py
+++ b/crate_anon/nlprp/nlprp_test_client.py
@@ -110,7 +110,7 @@ def fail(msg: str) -> NoReturn:
# "deflate" transfer-encodings from the server; see
# http://docs.python-requests.org/en/master/user/quickstart/.
# - The difference between gzip and deflate:
- # https://stackoverflow.com/questions/388595/why-use-deflate-instead-of-gzip-for-text-files-served-by-apache # noqa
+ # https://stackoverflow.com/questions/388595/why-use-deflate-instead-of-gzip-for-text-files-served-by-apache # noqa: E501
log.debug(
f"Reply has status_code={r.status_code}, headers={r.headers!r}, "
f"text={r.text!r}"
@@ -120,7 +120,7 @@ def fail(msg: str) -> NoReturn:
# "r.text" automatically does gzip decode
except (
ValueError
- ): # includes simplejson.errors.JSONDecodeError, json.decoder.JSONDecodeError # noqa
+ ): # includes simplejson.errors.JSONDecodeError, json.decoder.JSONDecodeError # noqa: E501
fail("Reply was not JSON")
response = None # for type checker
log.debug(f"Response JSON decoded to: {response.dict!r}")
diff --git a/crate_anon/preprocess/postcodes.py b/crate_anon/preprocess/postcodes.py
index f5841844..f7ee56cc 100755
--- a/crate_anon/preprocess/postcodes.py
+++ b/crate_anon/preprocess/postcodes.py
@@ -61,7 +61,7 @@
- https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography#output-area-oa
-""" # noqa
+""" # noqa: E501
from abc import ABC, ABCMeta, abstractmethod
import argparse
@@ -268,7 +268,7 @@ class ExtendedBase:
and thus define this class to inherit from those two metaclasses, so it can
be the metaclass we want.
- """ # noqa
+ """ # noqa: E501
pass
@@ -671,7 +671,7 @@ class CASWard(Base):
Represents census area statistics (CAS) wards in the UK, 2003.
- https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography#statistical-wards-cas-wards-and-st-wards
- """ # noqa
+ """ # noqa: E501
__filename__ = "CAS ward names and codes UK as at 01_03.xlsx"
__tablename__ = "cas_ward_2003"
@@ -1107,9 +1107,9 @@ def __init__(self, **kwargs: Any) -> None:
# =============================================================================
# Models: centroids
# =============================================================================
-# https://webarchive.nationalarchives.gov.uk/20160105160709/https://www.ons.gov.uk/ons/guide-method/geography/products/census/spatial/centroids/index.html # noqa
+# https://webarchive.nationalarchives.gov.uk/20160105160709/https://www.ons.gov.uk/ons/guide-method/geography/products/census/spatial/centroids/index.html # noqa: E501
#
-# Looking at lower_layer_super_output_areas_(e+w)_2011_population_weighted_centroids_v2.zip : # noqa
+# Looking at lower_layer_super_output_areas_(e+w)_2011_population_weighted_centroids_v2.zip : # noqa: E501
# - LSOA_2011_EW_PWC.shp -- probably a Shape file;
# ... yes
# ... https://en.wikipedia.org/wiki/Shapefile
@@ -1124,7 +1124,7 @@ class PopWeightedCentroidsLsoa2011(Base):
That is, the geographical centre of the LSOA, weighted by population. (A
first approximation: imagine every person pulling on the centroid
simultaneously and with equal force from their home. Where will it end up?)
- """ # noqa
+ """
__filename__ = "LSOA_2011_EW_PWC_COORD_V2.CSV"
__tablename__ = "pop_weighted_centroids_lsoa_2011"
# __debug_content__ = True
@@ -1133,7 +1133,7 @@ class PopWeightedCentroidsLsoa2011(Base):
lsoa_name = Column(String(NAME_LEN))
bng_north = Column(Integer, comment="British National Grid, North (m)")
bng_east = Column(Integer, comment="British National Grid, East (m)")
- # https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid#All-numeric_grid_references # noqa
+ # https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid#All-numeric_grid_references # noqa: E501
latitude = Column(Numeric(precision=13, scale=10),
comment="Latitude (degrees, 10dp)")
longitude = Column(Numeric(precision=13, scale=10),
diff --git a/crate_anon/preprocess/preprocess_pcmis.py b/crate_anon/preprocess/preprocess_pcmis.py
index b5dd02c4..78b6d2b0 100755
--- a/crate_anon/preprocess/preprocess_pcmis.py
+++ b/crate_anon/preprocess/preprocess_pcmis.py
@@ -569,7 +569,7 @@ def get_pcmis_dd_settings(ddhint: DDHint) -> str:
{sk.DDGEN_FORCE_LOWER_CASE} = False
{sk.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE} = True
-""" # noqa
+""" # noqa: E501
# =============================================================================
diff --git a/crate_anon/preprocess/rio_ddgen.py b/crate_anon/preprocess/rio_ddgen.py
index 7b1d1e9f..6fb240ba 100644
--- a/crate_anon/preprocess/rio_ddgen.py
+++ b/crate_anon/preprocess/rio_ddgen.py
@@ -392,4 +392,4 @@ def get_rio_dd_settings(ddhint: DDHint) -> str:
{sk.DDGEN_FORCE_LOWER_CASE} = False
{sk.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE} = True
-""" # noqa
+""" # noqa: E501
diff --git a/crate_anon/preprocess/rio_view_func.py b/crate_anon/preprocess/rio_view_func.py
index 7bbec64e..b63e40ac 100644
--- a/crate_anon/preprocess/rio_view_func.py
+++ b/crate_anon/preprocess/rio_view_func.py
@@ -249,7 +249,7 @@ def lookup_from_fragment_first_row_outer_apply(
- https://stackoverflow.com/questions/2043259/sql-server-how-to-join-to-first-row
- https://stackoverflow.com/questions/9275132/real-life-example-when-to-use-outer-cross-apply-in-sql
- """ # noqa
+ """ # noqa: E501
return (
f"OUTER APPLY (SELECT TOP 1 {', '.join(lookup_fields)} "
f"FROM {lookup_table} "
diff --git a/crate_anon/preprocess/systmone_ddgen.py b/crate_anon/preprocess/systmone_ddgen.py
index bf67d219..be9884c4 100644
--- a/crate_anon/preprocess/systmone_ddgen.py
+++ b/crate_anon/preprocess/systmone_ddgen.py
@@ -410,7 +410,7 @@
et al. 2014, https://pubmed.ncbi.nlm.nih.gov/25261970/) perform similar
researcher/data separation via other methods.
-""" # noqa
+""" # noqa: E501
# todo: SystmOne (CRATE traffic-light system): implement S1_ClinicalOutcome_ConsentResearch # noqa
diff --git a/crate_anon/tools/launch_celery.py b/crate_anon/tools/launch_celery.py
index 5264590d..d7f0c6d4 100755
--- a/crate_anon/tools/launch_celery.py
+++ b/crate_anon/tools/launch_celery.py
@@ -92,7 +92,7 @@ def inner_main() -> None:
# Without "--pool=solo", sod all happens: tasks go into the
# Reserved queue, but aren't executed.
# See:
- # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#module-celery.bin.worker # noqa
+ # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#module-celery.bin.worker # noqa: E501
# https://github.com/celery/celery/issues/2146
cmdargs += ["--pool=solo"]
diff --git a/crate_anon/tools/print_crateweb_demo_config.py b/crate_anon/tools/print_crateweb_demo_config.py
index badc271a..0658a687 100755
--- a/crate_anon/tools/print_crateweb_demo_config.py
+++ b/crate_anon/tools/print_crateweb_demo_config.py
@@ -76,21 +76,21 @@
# =============================================================================
# Site URL configuration
# =============================================================================
-# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa
+# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501
-# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" # example for Apache # noqa
-# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://localhost:8000" # for the Django dev server # noqa
+# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" # example for Apache # noqa: E501
+# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://localhost:8000" # for the Django dev server # noqa: E501
DJANGO_SITE_ROOT_ABSOLUTE_URL = "@@django_site_root_absolute_url@@"
FORCE_SCRIPT_NAME = "@@force_script_name@@"
# FORCE_SCRIPT_NAME = "" # example for Apache root hosting
-# FORCE_SCRIPT_NAME = "/crate" # example for CherryPy or Apache non-root hosting # noqa
+# FORCE_SCRIPT_NAME = "/crate" # example for CherryPy or Apache non-root hosting # noqa: E501
# =============================================================================
# Site security
# =============================================================================
-# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa
+# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = "@@secret_key@@"
@@ -399,12 +399,12 @@ def always_show_toolbar(request: "HttpRequest") -> bool:
EMAIL_USE_SSL = False
# Who will the e-mails appear to come from?
-EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa
+EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa: E501
# -----------------------------------------------------------------------------
# Additional settings
# -----------------------------------------------------------------------------
-# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa
+# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501
# During development, we route all consent-related e-mails to the developer.
# Switch SAFETY_CATCH_ON to False for production mode.
diff --git a/crate_anon/version.py b/crate_anon/version.py
index 913be140..aa90b4ed 100644
--- a/crate_anon/version.py
+++ b/crate_anon/version.py
@@ -34,10 +34,10 @@
# Constants
# =============================================================================
-CRATE_VERSION = "0.20.5"
+CRATE_VERSION = "0.20.6"
CRATE_VERSION_DATE = "2024-06-26"
-MINIMUM_PYTHON_VERSION = (3, 8)
+MINIMUM_PYTHON_VERSION = (3, 9)
# Only other place that has this: install_virtualenv.py (which can't import
# CRATE packages).
diff --git a/debugging/mssql_without_transaction.py b/debugging/mssql_without_transaction.py
index 32805738..5e423b58 100755
--- a/debugging/mssql_without_transaction.py
+++ b/debugging/mssql_without_transaction.py
@@ -99,7 +99,7 @@ def execute(engine_or_conn: Union[Connection, Engine], sql: str) -> None:
- SQLSetConnectAttr is an ODBC function:
https://docs.microsoft.com/en-us/sql/odbc/reference/syntax/sqlsetconnectattr-function
-""" # noqa
+""" # noqa: E501
print("SQLAlchemy engine again:")
show_query(engine, query_trancount) # 1
diff --git a/docker/dockerfiles/.env b/docker/dockerfiles/.env
index 05455b91..177c8c36 100644
--- a/docker/dockerfiles/.env
+++ b/docker/dockerfiles/.env
@@ -24,7 +24,7 @@ CRATE_DOCKER_GATE_VERSION=9.0.1
# Need to keep in sync with crate_anon/version.py
# Use e.g. -rc1 suffix in development
-CRATE_DOCKER_IMAGE_TAG=crate:0.20.5
+CRATE_DOCKER_IMAGE_TAG=crate:0.20.6-rc3
CRATE_DOCKER_ODBC_USER_CONFIG=odbc_user.ini
diff --git a/docs/recreate_inclusion_files.py b/docs/recreate_inclusion_files.py
index bf590ede..33b691d3 100755
--- a/docs/recreate_inclusion_files.py
+++ b/docs/recreate_inclusion_files.py
@@ -404,7 +404,7 @@ def main():
sudo npm install -g n
sudo n stable
sudo npm install npm@latest -g
-""" # noqa
+""" # noqa: E501
)
raise
diff --git a/docs/source/anonymisation/_crate_anon_check_text_extractor.txt b/docs/source/anonymisation/_crate_anon_check_text_extractor.txt
index 61c1b6e7..38082a67 100644
--- a/docs/source/anonymisation/_crate_anon_check_text_extractor.txt
+++ b/docs/source/anonymisation/_crate_anon_check_text_extractor.txt
@@ -1,7 +1,7 @@
USAGE: crate_anon_check_text_extractor [-h] [checkextractor ...]
Check availability of tools to extract text from different document formats.
-(CRATE version 0.20.5, 2024-06-26. Created by Rudolf Cardinal.)
+(CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.)
POSITIONAL ARGUMENTS:
checkextractor File extensions to check for availability of a text
diff --git a/docs/source/anonymisation/_crate_anon_demo_config_help.txt b/docs/source/anonymisation/_crate_anon_demo_config_help.txt
index 22741d16..296b24b3 100644
--- a/docs/source/anonymisation/_crate_anon_demo_config_help.txt
+++ b/docs/source/anonymisation/_crate_anon_demo_config_help.txt
@@ -1,6 +1,6 @@
USAGE: crate_anon_demo_config [-h] [--output OUTPUT] [--leave_placeholders]
-Print a demo config file for the CRATE anonymiser. (CRATE version 0.20.5,
+Print a demo config file for the CRATE anonymiser. (CRATE version 0.20.6,
2024-06-26. Created by Rudolf Cardinal.)
OPTIONS:
diff --git a/docs/source/anonymisation/_crate_anon_draft_dd.txt b/docs/source/anonymisation/_crate_anon_draft_dd.txt
index dac55775..ece98795 100644
--- a/docs/source/anonymisation/_crate_anon_draft_dd.txt
+++ b/docs/source/anonymisation/_crate_anon_draft_dd.txt
@@ -11,7 +11,7 @@ USAGE: crate_anon_draft_dd [-h] [--config CONFIG] [--verbose] [--incremental]
[--systmone_no_table_info_in_comments]
Draft a data dictionary for the anonymiser, by scanning a source database.
-(CRATE version 0.20.5, 2024-06-26. Created by Rudolf Cardinal.)
+(CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.)
OPTIONS:
-h, --help show this help message and exit
diff --git a/docs/source/anonymisation/_crate_anon_show_counts_help.txt b/docs/source/anonymisation/_crate_anon_show_counts_help.txt
index 1c2f2524..8cb66d00 100644
--- a/docs/source/anonymisation/_crate_anon_show_counts_help.txt
+++ b/docs/source/anonymisation/_crate_anon_show_counts_help.txt
@@ -1,6 +1,6 @@
USAGE: crate_anon_show_counts [-h] [--config CONFIG] [--verbose]
-Print record counts from source/destination databases. (CRATE version 0.20.5,
+Print record counts from source/destination databases. (CRATE version 0.20.6,
2024-06-26. Created by Rudolf Cardinal.)
OPTIONS:
diff --git a/docs/source/anonymisation/_crate_anon_summarize_dd_help.txt b/docs/source/anonymisation/_crate_anon_summarize_dd_help.txt
index c54f9a65..e26a859d 100644
--- a/docs/source/anonymisation/_crate_anon_summarize_dd_help.txt
+++ b/docs/source/anonymisation/_crate_anon_summarize_dd_help.txt
@@ -2,7 +2,7 @@ USAGE: crate_anon_summarize_dd [-h] [--config CONFIG] [--verbose]
[--output OUTPUT]
Summarize a data dictionary for the anonymiser. The resulting
-spreadsheet-style report has one row per source table. (CRATE version 0.20.5,
+spreadsheet-style report has one row per source table. (CRATE version 0.20.6,
2024-06-26. Created by Rudolf Cardinal.)
OPTIONS:
diff --git a/docs/source/anonymisation/_crate_anonymise_help.txt b/docs/source/anonymisation/_crate_anonymise_help.txt
index 89a09bc1..5a2dbfa8 100644
--- a/docs/source/anonymisation/_crate_anonymise_help.txt
+++ b/docs/source/anonymisation/_crate_anonymise_help.txt
@@ -11,7 +11,7 @@ USAGE: crate_anonymise [-h] [--config CONFIG] [--version] [--verbose]
[--reportevery [REPORTEVERY]] [--debugscrubbers]
[--savescrubbers] [--echo]
-Database anonymiser. (CRATE version 0.20.5, 2024-06-26. Created by Rudolf
+Database anonymiser. (CRATE version 0.20.6, 2024-06-26. Created by Rudolf
Cardinal.)
OPTIONS:
diff --git a/docs/source/anonymisation/_crate_anonymise_multiprocess_help.txt b/docs/source/anonymisation/_crate_anonymise_multiprocess_help.txt
index 032c7fe8..7aa275da 100644
--- a/docs/source/anonymisation/_crate_anonymise_multiprocess_help.txt
+++ b/docs/source/anonymisation/_crate_anonymise_multiprocess_help.txt
@@ -1,6 +1,6 @@
USAGE: crate_anonymise_multiprocess [-h] [--nproc [NPROC]] [--verbose]
-Runs the CRATE anonymiser in parallel. Version 0.20.5 (2024-06-26). Note that
+Runs the CRATE anonymiser in parallel. Version 0.20.6 (2024-06-26). Note that
all arguments not specified here are passed to the underlying script (see
crate_anonymise --help).
diff --git a/docs/source/anonymisation/_crate_researcher_report_help.txt b/docs/source/anonymisation/_crate_researcher_report_help.txt
index b7af53f9..bac25a69 100644
--- a/docs/source/anonymisation/_crate_researcher_report_help.txt
+++ b/docs/source/anonymisation/_crate_researcher_report_help.txt
@@ -16,7 +16,7 @@ USAGE: crate_researcher_report [-h] [--config CONFIG] [--noconfig]
output
Produce a researcher-oriented PDF report about a destination database.
-(CRATE version 0.20.5, 2024-06-26. Created by Rudolf Cardinal.)
+(CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.)
Note: if wkhtmtopdf reports 'Too many open files', see
- https://stackoverflow.com/q/25355697;
diff --git a/docs/source/anonymisation/_crate_subset_db_help.txt b/docs/source/anonymisation/_crate_subset_db_help.txt
index 72c9175b..51dbd3ad 100644
--- a/docs/source/anonymisation/_crate_subset_db_help.txt
+++ b/docs/source/anonymisation/_crate_subset_db_help.txt
@@ -12,7 +12,7 @@ USAGE: crate_subset_db [-h] --src_db_url SRC_DB_URL --dst_db_url DST_DB_URL
[--exclude_table_filenames [EXCLUDE_TABLE_FILENAMES ...]]
[--verbose] [--echo]
-Create a simple subset of a database, copying one database to another while applying filters. You can filter by a standard column (e.g. one representing patient IDs), taking permitted filter values from the command line, from file(s), and/or from database(s). You can also decide which tables to include/exclude. (CRATE version 0.20.5, 2024-06-26. Created by Rudolf Cardinal.)
+Create a simple subset of a database, copying one database to another while applying filters. You can filter by a standard column (e.g. one representing patient IDs), taking permitted filter values from the command line, from file(s), and/or from database(s). You can also decide which tables to include/exclude. (CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.)
OPTIONS:
-h, --help show this help message and exit
diff --git a/docs/source/anonymisation/_specimen_anonymiser_config.ini b/docs/source/anonymisation/_specimen_anonymiser_config.ini
index 352110af..80f5da6d 100644
--- a/docs/source/anonymisation/_specimen_anonymiser_config.ini
+++ b/docs/source/anonymisation/_specimen_anonymiser_config.ini
@@ -1,5 +1,5 @@
# Configuration file for CRATE anonymiser (crate_anonymise).
-# Version 0.20.5 (2024-06-26).
+# Version 0.20.6 (2024-06-26).
#
# SEE HELP FOR DETAILS.
diff --git a/docs/source/autodoc/nlp_manager/_index.rst b/docs/source/autodoc/nlp_manager/_index.rst
index b9b0bc87..519a5e45 100644
--- a/docs/source/autodoc/nlp_manager/_index.rst
+++ b/docs/source/autodoc/nlp_manager/_index.rst
@@ -57,6 +57,7 @@ crate_anon/nlp_manager
parse_medex.py.rst
parse_substance_misuse.py.rst
prepare_umls_for_bioyodie.py.rst
+ processor_helpers.py.rst
regex_func.py.rst
regex_numbers.py.rst
regex_parser.py.rst
diff --git a/docs/source/autodoc/nlp_manager/processor_helpers.py.rst b/docs/source/autodoc/nlp_manager/processor_helpers.py.rst
new file mode 100644
index 00000000..7439256f
--- /dev/null
+++ b/docs/source/autodoc/nlp_manager/processor_helpers.py.rst
@@ -0,0 +1,27 @@
+.. docs/source/autodoc/nlp_manager/processor_helpers.py.rst
+
+.. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
+
+.. Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
+ Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
+ .
+ This file is part of CRATE.
+ .
+ CRATE is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ .
+ CRATE is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ .
+ You should have received a copy of the GNU General Public License
+ along with CRATE. If not, see <https://www.gnu.org/licenses/>.
+
+crate_anon.nlp_manager.processor_helpers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: crate_anon.nlp_manager.processor_helpers
+ :members:
diff --git a/docs/source/autodoc_extra/_command_line_index.rst b/docs/source/autodoc_extra/_command_line_index.rst
index 80e74e2d..3a5d15da 100644
--- a/docs/source/autodoc_extra/_command_line_index.rst
+++ b/docs/source/autodoc_extra/_command_line_index.rst
@@ -127,4 +127,4 @@ Index of CRATE commands
:ref:`crate_windows_service `
-(Documentation built with CRATE 0.20.5.)
+(Documentation built with CRATE 0.20.6.)
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 2b61b518..6b695569 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1692,6 +1692,11 @@ Changes
characters in the source database. Convert any unusual characters to ASCII in
the destination database.
+- Update the NLP handler to cope with remote NLPRP servers that provide
+ tabular_schema data, and to create local tables from that schema if
+ desired. Change the default NLPRP server behaviour to the more explicit
+ result format (objects/dictionaries keyed by table, not plain
+ arrays/lists of records).
+
To do
-----
diff --git a/docs/source/misc/technical_notes.rst b/docs/source/misc/technical_notes.rst
index 2b67407b..09698435 100644
--- a/docs/source/misc/technical_notes.rst
+++ b/docs/source/misc/technical_notes.rst
@@ -759,7 +759,7 @@ webnotes.txt
- ... or could validate manually with a form, e.g.
form = MyForm(request.GET, extraparam)
using the style at
- https://stackoverflow.com/questions/18769607/django-form-with-customer-parameter-and-validation-not-getting-clean-function # noqa
+ https://stackoverflow.com/questions/18769607/django-form-with-customer-parameter-and-validation-not-getting-clean-function # noqa: E501
2. Query parameters
- can encode using urllib, e.g.
diff --git a/docs/source/nlp/_crate_nlp_help.txt b/docs/source/nlp/_crate_nlp_help.txt
index 4a95e899..786c52b5 100644
--- a/docs/source/nlp/_crate_nlp_help.txt
+++ b/docs/source/nlp/_crate_nlp_help.txt
@@ -9,7 +9,7 @@ USAGE: crate_nlp [-h] [--config CONFIG] [--nlpdef NLPDEF] [-i | -f]
[--count] [--cloud] [--immediate] [--retrieve]
[--cancelrequest] [--cancelall] [--showqueue]
-NLP manager. Version 0.20.5 (2024-06-26). Created by Rudolf Cardinal.
+NLP manager. Version 0.20.6 (2024-06-26). Created by Rudolf Cardinal.
OPTIONS:
-h, --help show this help message and exit
diff --git a/docs/source/nlp/_crate_nlp_multiprocess_help.txt b/docs/source/nlp/_crate_nlp_multiprocess_help.txt
index e4eaaa23..de6d4982 100644
--- a/docs/source/nlp/_crate_nlp_multiprocess_help.txt
+++ b/docs/source/nlp/_crate_nlp_multiprocess_help.txt
@@ -1,7 +1,7 @@
USAGE: crate_nlp_multiprocess [-h] --nlpdef NLPDEF [--nproc [NPROC]]
[--verbose]
-Runs the CRATE NLP manager in parallel. Version 0.20.5 (2024-06-26). Note that
+Runs the CRATE NLP manager in parallel. Version 0.20.6 (2024-06-26). Note that
all arguments not specified here are passed to the underlying script (see
crate_nlp --help).
diff --git a/docs/source/nlp/_nlprp_test_client.py b/docs/source/nlp/_nlprp_test_client.py
index 3b6aeaa1..bc549569 100755
--- a/docs/source/nlp/_nlprp_test_client.py
+++ b/docs/source/nlp/_nlprp_test_client.py
@@ -110,7 +110,7 @@ def fail(msg: str) -> NoReturn:
# "deflate" transfer-encodings from the server; see
# http://docs.python-requests.org/en/master/user/quickstart/.
# - The difference between gzip and deflate:
- # https://stackoverflow.com/questions/388595/why-use-deflate-instead-of-gzip-for-text-files-served-by-apache # noqa
+ # https://stackoverflow.com/questions/388595/why-use-deflate-instead-of-gzip-for-text-files-served-by-apache # noqa: E501
log.debug(
f"Reply has status_code={r.status_code}, headers={r.headers!r}, "
f"text={r.text!r}"
@@ -120,7 +120,7 @@ def fail(msg: str) -> NoReturn:
# "r.text" automatically does gzip decode
except (
ValueError
- ): # includes simplejson.errors.JSONDecodeError, json.decoder.JSONDecodeError # noqa
+ ): # includes simplejson.errors.JSONDecodeError, json.decoder.JSONDecodeError # noqa: E501
fail("Reply was not JSON")
response = None # for type checker
log.debug(f"Response JSON decoded to: {response.dict!r}")
diff --git a/docs/source/nlp/_specimen_nlp_config_file.ini b/docs/source/nlp/_specimen_nlp_config_file.ini
index fd1888cf..446634c6 100644
--- a/docs/source/nlp/_specimen_nlp_config_file.ini
+++ b/docs/source/nlp/_specimen_nlp_config_file.ini
@@ -1,5 +1,5 @@
# Configuration file for CRATE NLP manager (crate_nlp).
-# Version 0.20.5 (2024-06-26).
+# Version 0.20.6 (2024-06-26).
#
# PLEASE SEE THE HELP at https://crateanon.readthedocs.io/
# Using defaults for Docker environment: False
diff --git a/docs/source/nlp/nlp_config.rst b/docs/source/nlp/nlp_config.rst
index 512b7ea2..99f945a3 100644
--- a/docs/source/nlp/nlp_config.rst
+++ b/docs/source/nlp/nlp_config.rst
@@ -218,7 +218,9 @@ and CRATE would then look for a :ref:`processor definition
``[processor:mygateproc_name_location]``, and expect it to have the information
required for a GATE processor.
-For possible processor types, see ``crate_nlp --listprocessors``.
+For possible processor types, see ``crate_nlp --listprocessors``. These include
+CRATE internal processors (e.g. "Glucose"), external tools run locally (e.g.
+"GATE"), and cloud-based NLP ("Cloud").
progressdb
@@ -456,33 +458,6 @@ Destination database; the name of a :ref:`database definition
` in the config file.
-desttable
-#########
-
-*String.*
-
-**Applicable to: Cloud, MedEx, all CRATE Python processors.**
-
-The name of the table in the destination database in which the results should
-be stored.
-
-
-assume_preferred_unit
-#####################
-
-*Boolean.* Default: True.
-
-**Applicable to: nearly all numerical CRATE Python processors.**
-
-If a unit is not specified, assume that values are in the processor's preferred
-units. (For example, :class:`crate_anon.nlp_manager.parse_biochemistry.Crp`
-will assume mg/L.)
-
-Some override this and are not configurable, however:
-
-- ``AlcoholUnits`` never assumes this.
-
-
.. _nlp_config_processor_desttable:
desttable
@@ -490,12 +465,15 @@ desttable
*String.*
-**Applicable to: Cloud.**
+**Applicable to: MedEx, all CRATE Python processors.**
-Table name in the destination (NLP output) database into which to write results
-from the cloud NLP processor. Use this for single-table processors.
+The name of the table in the destination database in which the results should
+be stored.
-The alternative is :ref:`outputtypemap <nlp_config_processor_outputtypemap>`.
+**Cloud** and local **GATE** processors may produce output for multiple tables
+(or a single table, but potentially one that you need to help define). For
+these, use :ref:`outputtypemap <nlp_config_processor_outputtypemap>` instead.
+That parameter refers to "output" configuration sections, in which you can
+define the table(s).
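+
+For example, a minimal sketch for a CRATE Python processor writing to a single
+table (the section, database, and table names here are hypothetical)::
+
+    [processor:procdef_my_crp]
+    destdb = nlp_destination
+    desttable = crp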
.. _nlp_config_processor_outputtypemap:
@@ -507,19 +485,21 @@ outputtypemap
**Applicable to: GATE, Cloud.**
-For GATE:
+For GATE processors:
What's GATE? See the section on :ref:`GATE NLP `.
- Map GATE '_type' parameters to possible destination tables (in
- case-insensitive fashion). This parameter is follows is a list of pairs,
- one pair per line.
+   This tabular entry maps GATE '_type' parameters to possible destination
+   tables (in case-insensitive fashion). This parameter is a list of pairs,
+   one pair per line.
- The first item of each is the annotation type coming out of the GATE
system.
- The second is the output type section defined in this file (as a separate
- section). Those sections (q.v.) define tables and columns (fields).
+ section). Those sections each define a table with its columns (fields);
+    see :ref:`GATE/cloud output definitions
+    <nlp_config_section_gate_cloud_output>`.
Example:
@@ -531,15 +511,40 @@ For GATE:
This example would take output from GATE labelled with ``_type=Person`` and
send it to output defined in the ``[output:output_person]`` section of the
- config file -- see :ref:`GATE output definitions
- `. Equivalently for the ``Location`` type.
+   config file -- see :ref:`GATE/cloud output definitions
+   <nlp_config_section_gate_cloud_output>`. Equivalently for the ``Location``
+ type.
+
+For cloud processors:
+
+ - The first parameter is the remote server's tablename (see :ref:`NLPRP
+ schema definition `). The second is an output
+ type definition, as above (which may define the table in full, or just
+ name it and leave the definition to the remote processor).
+
+ - Use this method whenever the remote processor may return data for more
+ than one table.
+
+ - If the remote processor will only return results for a single table, and
+ doesn't name it, the FIRST definition in the output type map is used
+ (and the first element of the pair is ignored for this purpose, i.e.
+ you can use any string you want).
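+
+   For example, a minimal sketch (the remote table and output section names
+   here are hypothetical)::
+
+       outputtypemap =
+           remote_table_one output_one
+           remote_table_two output_two
+
+   Results that the remote processor returns for ``remote_table_one`` would
+   then go to the table defined in your ``[output:output_one]`` section, and
+   similarly for ``remote_table_two``.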
+
+
+assume_preferred_unit
+#####################
+
+*Boolean.* Default: True.
-For cloud:
+**Applicable to: nearly all numerical CRATE Python processors.**
- - The alternative is :ref:`desttable <nlp_config_processor_desttable>`.
+If a unit is not specified, assume that values are in the processor's preferred
+units. (For example, :class:`crate_anon.nlp_manager.parse_biochemistry.Crp`
+will assume mg/L.)
- - If both are present, only :ref:`outputtypemap
-    <nlp_config_processor_outputtypemap>` will be used.
+Some override this and are not configurable, however:
+
+- ``AlcoholUnits`` never assumes this.
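+
+For example, to require explicit units (a sketch; the processor section name
+here is hypothetical)::
+
+    [processor:procdef_my_crp]
+    assume_preferred_unit = False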
.. _nlp_config_section_gate_progargs:
@@ -739,17 +744,25 @@ same format as the CRATE processors. ``GATE`` refers to GATE remote
processors, which return a standard set of columns.
-.. _nlp_config_section_gate_output:
+.. _nlp_config_section_gate_cloud_output:
-Config file section: GATE output definition
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Config file section: GATE/cloud output definition
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These are config file sections named ``[output:XXX]`` where ``XXX`` is the
-name of one of your GATE output types [#outputuserconfig]_.
+name of one of your GATE output types, or cloud remote processor table
+names [#outputuserconfig]_.
+
+For GATE applications, we need this additional information because CRATE
+doesn't automatically know what sort of output they will produce. The tables
+and SPECIFIC output fields for a given GATE processor are defined here.
-This is an additional thing we need for GATE applications, since CRATE doesn't
-automatically know what sort of output they will produce. The tables and
-SPECIFIC output fields for a given GATE processor are defined here.
+For remote cloud processors, this section enables you to rename remote tables
+to something appropriate locally, and to add options (such as indexing).
+Additionally, the remote application may or may not tell CRATE exactly what
+tabular structure it uses; even when the remote application is helpfully
+informative, you should not automatically trust remotely provided table names.
+So this section is still mandatory.
They are referred to by the :ref:`outputtypemap
<nlp_config_processor_outputtypemap>` parameter (q.v.).
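+
+For example, a minimal sketch of such a section (all names here are
+hypothetical; the parameters shown are described below)::
+
+    [output:output_person]
+    desttable = person
+    destfields =
+        firstname VARCHAR(100) First name
+        surname VARCHAR(100) Surname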
@@ -761,7 +774,7 @@ desttable
*String.*
Table name in the destination (NLP output) database into which to write results
-from the GATE NLP application.
+from the GATE/cloud NLP application.
renames
@@ -769,9 +782,15 @@ renames
*Multiline string.*
-A list of ``from, to`` things to rename from the GATE output en route to the
-database. In each case, the ``from`` item is the name of a GATE output
-annotation. The ``to`` item is the destination field/column name.
+This is an optional "column renaming" section.
+
+For GATE processors: a list of ``from, to`` pairs, renaming items from the
+GATE output en route to the database. In each case, the ``from`` item is the
+name of a GATE output annotation. The ``to`` item is the destination
+field/column name.
+
+Also applicable to cloud processors; you can rename columns in this way. The
+``from`` item is the column name as specified by the remote processor, and the
+``to`` item is the local destination column name.
Specify one pair per line. You can quote, using shlex_ rules.
Case-sensitive.
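+
+For example, a minimal sketch (the annotation and column names here are
+hypothetical)::
+
+    renames =
+        firstName first_name
+        "name with spaces" name_with_spaces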
@@ -807,6 +826,8 @@ null_literals
*Multiline string.*
+**Applicable to: GATE only.**
+
Define values that will be treated as ``NULL`` in SQL. For example, sometimes
GATE provides the string ``null`` for a NULL value; we can convert to a proper
SQL NULL.
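+
+For example, a minimal sketch (the second literal is illustrative only)::
+
+    null_literals =
+        null
+        NULL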
@@ -853,6 +874,11 @@ separate the columns. Examples:
gender VARCHAR(7) Gender (e.g. male, female, unknown)
kind VARCHAR(100) Kind of name (e.g. personName, fullName)
+For cloud applications, this is optional. If you specify **any** lines here,
+your table will be created from this definition (plus the universal CRATE NLP
+columns). If you specify none, it will be created according to the remote
+table specification (again plus the universal CRATE NLP columns).
+
indexdefs
#########
diff --git a/docs/source/website_config/_specimen_web_config.py b/docs/source/website_config/_specimen_web_config.py
index 23b30d90..3fd48f17 100644
--- a/docs/source/website_config/_specimen_web_config.py
+++ b/docs/source/website_config/_specimen_web_config.py
@@ -35,21 +35,21 @@
# =============================================================================
# Site URL configuration
# =============================================================================
-# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa
+# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501
-# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" # example for Apache # noqa
-# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://localhost:8000" # for the Django dev server # noqa
+# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" # example for Apache # noqa: E501
+# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://localhost:8000" # for the Django dev server # noqa: E501
DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain"
FORCE_SCRIPT_NAME = ""
# FORCE_SCRIPT_NAME = "" # example for Apache root hosting
-# FORCE_SCRIPT_NAME = "/crate" # example for CherryPy or Apache non-root hosting # noqa
+# FORCE_SCRIPT_NAME = "/crate" # example for CherryPy or Apache non-root hosting # noqa: E501
# =============================================================================
# Site security
# =============================================================================
-# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa
+# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = "aaaaaaaaaaaaaaaaaa CHANGE THIS! aaaaaaaaaaaaaaaaaa"
@@ -358,12 +358,12 @@ def always_show_toolbar(request: "HttpRequest") -> bool:
EMAIL_USE_SSL = False
# Who will the e-mails appear to come from?
-EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa
+EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa: E501
# -----------------------------------------------------------------------------
# Additional settings
# -----------------------------------------------------------------------------
-# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa
+# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501
# During development, we route all consent-related e-mails to the developer.
# Switch SAFETY_CATCH_ON to False for production mode.
diff --git a/docs/source/website_config/web_config_file.rst b/docs/source/website_config/web_config_file.rst
index a3950b8a..ed4b57b4 100644
--- a/docs/source/website_config/web_config_file.rst
+++ b/docs/source/website_config/web_config_file.rst
@@ -704,7 +704,7 @@ https://docs.djangoproject.com/en/1.8/ref/settings/#email-backend. Example:
EMAIL_USE_SSL = False
# Who will the e-mails appear to come from?
- EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa
+ EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa: E501
Then there are some additional custom settings:
diff --git a/tools/make_package.py b/tools/make_package.py
index 5d4982a7..23821424 100755
--- a/tools/make_package.py
+++ b/tools/make_package.py
@@ -347,8 +347,8 @@ def workpath(destpath: str, *args: str) -> str:
f"adduser --system --ingroup {CRATE_GROUP} {CRATE_USER}"
)
# MAKE_USER_COMMAND_2 = f"adduser --ingroup {CRATE_GROUP} {CRATE_USER}"
- # https://lintian.debian.org/tags/maintainer-script-should-not-use-adduser-system-without-home.html # noqa
- # http://unix.stackexchange.com/questions/47880/how-debian-package-should-create-user-accounts # noqa
+ # https://lintian.debian.org/tags/maintainer-script-should-not-use-adduser-system-without-home.html # noqa: E501
+ # http://unix.stackexchange.com/questions/47880/how-debian-package-should-create-user-accounts # noqa: E501
else:
MAKE_USER_COMMAND_1 = "# No need to add user"
MAKE_USER_COMMAND_2 = ""
@@ -514,7 +514,7 @@ def workpath(destpath: str, *args: str) -> str:
f"""
# Not an official new Debian package, so ignore this one.
# If we did want to close a new-package ITP bug:
-# http://www.debian.org/doc/manuals/developers-reference/pkgs.html#upload-bugfix # noqa
+# http://www.debian.org/doc/manuals/developers-reference/pkgs.html#upload-bugfix # noqa: E501
{PACKAGE_FOR_DEB} binary: new-package-should-close-itp-bug
{PACKAGE_FOR_DEB} binary: extra-license-file
{PACKAGE_FOR_DEB} binary: embedded-javascript-library