diff --git a/crate_anon/anonymise/anonregex.py b/crate_anon/anonymise/anonregex.py index 78f99505..830bc7d5 100755 --- a/crate_anon/anonymise/anonregex.py +++ b/crate_anon/anonymise/anonregex.py @@ -527,7 +527,7 @@ def get_uk_postcode_regex_elements( See: - https://stackoverflow.com/questions/164979/regex-for-matching-uk-postcodes - """ # noqa + """ # noqa: E501 # ------------------------------------------------------------------------- # Old # ------------------------------------------------------------------------- diff --git a/crate_anon/anonymise/config.py b/crate_anon/anonymise/config.py index c5e5a4ce..c19ea4e7 100644 --- a/crate_anon/anonymise/config.py +++ b/crate_anon/anonymise/config.py @@ -456,7 +456,7 @@ def get_word_alternatives(filenames: List[str]) -> List[List[str]]: Returns: a list of lists of equivalent words - """ # noqa + """ # noqa: E501 alternatives = [] # type: List[List[str]] all_words_seen = set() # type: Set[str] for filename in filenames: diff --git a/crate_anon/anonymise/constants.py b/crate_anon/anonymise/constants.py index f3b06798..f7c7117f 100644 --- a/crate_anon/anonymise/constants.py +++ b/crate_anon/anonymise/constants.py @@ -198,7 +198,7 @@ class SrcFlag(StrEnum): # MySQL: 64 -- http://dev.mysql.com/doc/refman/5.7/en/identifiers.html SQLSERVER_MAX_IDENTIFIER_LENGTH = 128 # Microsoft SQL Server: 128 -- -# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers # noqa +# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers # noqa: E501 # ============================================================================= @@ -862,7 +862,7 @@ class HashConfigKeys: {_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} = {_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} = -""" # noqa +""" # noqa: E501 # For the style: # [source_databases] diff --git a/crate_anon/anonymise/dd.py b/crate_anon/anonymise/dd.py index f6bbfba5..d62b715e 100644 --- a/crate_anon/anonymise/dd.py +++ b/crate_anon/anonymise/dd.py @@ -571,7 +571,7 @@ def tidy_draft(self) -> None: CREATE TABLE junk (intthing INT PRIMARY KEY, text1 LONGTEXT, text2 LONGTEXT); ALTER TABLE junk ADD FULLTEXT INDEX ftidx1 (text1); ALTER TABLE junk ADD FULLTEXT INDEX ftidx2 (text2); -- OK - """ # noqa + """ # noqa: E501 log.info("Tidying/correcting draft data dictionary") # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/crate_anon/anonymise/ddr.py b/crate_anon/anonymise/ddr.py index 1ebe2925..edd1dfcc 100644 --- a/crate_anon/anonymise/ddr.py +++ b/crate_anon/anonymise/ddr.py @@ -1045,7 +1045,7 @@ def dest_sqla_column(self) -> Column: kwargs = { "doc": comment, # Python side "comment": comment, # SQL side; supported from SQLAlchemy 1.2: - # https://docs.sqlalchemy.org/en/14/core/metadata.html#sqlalchemy.schema.Column.params.comment # noqa + # https://docs.sqlalchemy.org/en/14/core/metadata.html#sqlalchemy.schema.Column.params.comment # noqa: E501 } if self.pk: kwargs["primary_key"] = True diff --git a/crate_anon/anonymise/launch_multiprocess_anonymiser.py b/crate_anon/anonymise/launch_multiprocess_anonymiser.py index 6719ad44..33759f31 100755 --- a/crate_anon/anonymise/launch_multiprocess_anonymiser.py +++ b/crate_anon/anonymise/launch_multiprocess_anonymiser.py @@ -41,7 +41,7 @@ - https://stackoverflow.com/questions/23611396/python-execute-cat-subprocess-in-parallel - https://stackoverflow.com/questions/320232/ensuring-subprocesses-are-dead-on-exiting-python-program - 
https://stackoverflow.com/questions/641420/how-should-i-log-while-using-multiprocessing-in-python -""" # noqa +""" # noqa: E501 import argparse import logging diff --git a/crate_anon/anonymise/models.py b/crate_anon/anonymise/models.py index 32fb2e2c..c580f895 100644 --- a/crate_anon/anonymise/models.py +++ b/crate_anon/anonymise/models.py @@ -35,7 +35,7 @@ To create a SQLAlchemy ORM programmatically: - https://stackoverflow.com/questions/2574105/sqlalchemy-dynamic-mapping/2575016#2575016 -""" # noqa +""" # noqa: E501 import logging import random diff --git a/crate_anon/common/bugfix_flashtext.py b/crate_anon/common/bugfix_flashtext.py index 0c60d11d..f5eee77d 100644 --- a/crate_anon/common/bugfix_flashtext.py +++ b/crate_anon/common/bugfix_flashtext.py @@ -54,13 +54,13 @@ def replace_keywords(self, a_sentence: str) -> str: if not self.case_sensitive: sentence = a_sentence.lower() # by Ihor Bobak: - # some letters can expand in size when lower() is called, therefore we will preprocess # noqa - # a_sentense to find those letters which lower()-ed to 2 or more symbols. # noqa - # So, imagine that X is lowered as yz, the rest are lowered as is: A->a, B->b, C->c # noqa + # some letters can expand in size when lower() is called, therefore we will preprocess # noqa: E501 + # a_sentense to find those letters which lower()-ed to 2 or more symbols. # noqa: E501 + # So, imagine that X is lowered as yz, the rest are lowered as is: A->a, B->b, C->c # noqa: E501 # then for the string ABCXABC we want to get # ['A', 'B', 'C', 'X', '', 'A', 'B', 'C'] which corresponds to - # ['a', 'b', 'c', 'y', 'z', 'a', 'b', 'c'] because when the code below will run by the indexes # noqa - # of the lowered string, it will "glue" the original string also by THE SAME indexes # noqa + # ['a', 'b', 'c', 'y', 'z', 'a', 'b', 'c'] because when the code below will run by the indexes # noqa: E501 + # of the lowered string, it will "glue" the original string also by THE SAME indexes # noqa: E501 orig_sentence = [] for i in range(0, len(a_sentence)): char = a_sentence[i] diff --git a/crate_anon/common/constants.py b/crate_anon/common/constants.py index 705a191c..bbea601a 100644 --- a/crate_anon/common/constants.py +++ b/crate_anon/common/constants.py @@ -52,6 +52,8 @@ # Is this program running on readthedocs.org? ON_READTHEDOCS = os.environ.get("READTHEDOCS") == "True" +NoneType = type(None) # for isinstance, sometimes + # ============================================================================= # Constant creation @@ -111,7 +113,7 @@ class DockerConstants: HOST = "0.0.0.0" # ... 
not "localhost" or "127.0.0.1"; see - # https://nickjanetakis.com/blog/docker-tip-54-fixing-connection-reset-by-peer-or-similar-errors # noqa + # https://nickjanetakis.com/blog/docker-tip-54-fixing-connection-reset-by-peer-or-similar-errors # noqa: E501 # ============================================================================= diff --git a/crate_anon/common/extendedconfigparser.py b/crate_anon/common/extendedconfigparser.py index a29d53b9..de6e81ee 100644 --- a/crate_anon/common/extendedconfigparser.py +++ b/crate_anon/common/extendedconfigparser.py @@ -30,6 +30,7 @@ import ast import configparser import logging +import os.path from typing import ( Any, Dict, @@ -148,7 +149,7 @@ def __init__(self, *args, case_sensitive: bool = False, **kwargs) -> None: # 'converters': Python 3.5 and up super().__init__(*args, **kwargs) if case_sensitive: - # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser # noqa + # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser # noqa: E501 self.optionxform = str # Use the underlying ConfigParser class for e.g. @@ -520,6 +521,10 @@ def __init__( self.parser = ExtendedConfigParser(case_sensitive=case_sensitive) if filename: log.info(f"Reading config file: {filename}") + if not os.path.isfile(filename): + raise RuntimeError( + f"Config file {filename} does not exist" + ) self.parser.read(filename, encoding=encoding) else: self.parser.read_file(fileobj) diff --git a/crate_anon/common/memsize.py b/crate_anon/common/memsize.py index 129caf82..601f7cc4 100644 --- a/crate_anon/common/memsize.py +++ b/crate_anon/common/memsize.py @@ -27,7 +27,7 @@ From https://stackoverflow.com/questions/449560/how-do-i-determine-the-size-of-an-object-in-python -""" # noqa +""" # noqa: E501 from gc import get_referents from sys import getsizeof @@ -52,7 +52,7 @@ def getsize(obj: Any, assume_none_denied: bool = False) -> int: Skip checks for classes/modules/functions. Assume that all objects should be checked (typically, meaning that the caller guarantees not to pass stuff that doesn't need checking). - """ # noqa + """ # noqa: E501 if not assume_none_denied: if isinstance(obj, DENYLIST): raise TypeError( diff --git a/crate_anon/common/regex_helpers.py b/crate_anon/common/regex_helpers.py index 82d9e94e..3f8df720 100644 --- a/crate_anon/common/regex_helpers.py +++ b/crate_anon/common/regex_helpers.py @@ -91,7 +91,7 @@ _NOT_EMPTY_WORD_ONLY_REGEX = regex.compile(r"^\w+$") _NOT_EMPTY_ALPHABETICAL_ONLY_REGEX = regex.compile("^[a-zA-Z]+$") -# cf. https://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores # noqa +# cf. 
https://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores # noqa: E501 # ============================================================================= diff --git a/crate_anon/common/spreadsheet.py b/crate_anon/common/spreadsheet.py index 99a8dad3..ec473aa4 100644 --- a/crate_anon/common/spreadsheet.py +++ b/crate_anon/common/spreadsheet.py @@ -289,7 +289,7 @@ def write_spreadsheet( ext = filetype or os.path.splitext(filename)[1] if filename == "-" or ext == SpreadsheetFileExtensions.TSV.value: first_key = next(iter(data)) - # https://stackoverflow.com/questions/30362391/how-do-you-find-the-first-key-in-a-dictionary # noqa + # https://stackoverflow.com/questions/30362391/how-do-you-find-the-first-key-in-a-dictionary # noqa: E501 first_sheet = data[first_key] write_tsv(filename, first_sheet) elif ext == SpreadsheetFileExtensions.CSV.value: diff --git a/crate_anon/common/sql.py b/crate_anon/common/sql.py index da651430..afd300db 100644 --- a/crate_anon/common/sql.py +++ b/crate_anon/common/sql.py @@ -151,9 +151,9 @@ MSSQL_COLTYPE_TO_LEN = { # The "N" prefix means Unicode. - # https://docs.microsoft.com/en-us/sql/t-sql/data-types/char-and-varchar-transact-sql?view=sql-server-ver15 # noqa - # https://docs.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-ver15 # noqa - # https://docs.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver15 # noqa + # https://docs.microsoft.com/en-us/sql/t-sql/data-types/char-and-varchar-transact-sql?view=sql-server-ver15 # noqa: E501 + # https://docs.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-ver15 # noqa: E501 + # https://docs.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver15 # noqa: E501 "NVARCHAR_MAX": 2**30 - 1, # Can specify NVARCHAR(1) to NVARCHAR(4000), or NVARCHAR(MAX) for 2^30 - 1. "VARCHAR_MAX": 2**31 - 1, @@ -222,7 +222,7 @@ class SchemaId: - https://stackoverflow.com/questions/11618277/difference-between-schema-database-in-mysql - """ # noqa + """ # noqa: E501 def __init__(self, db: str = "", schema: str = "") -> None: """ @@ -2452,7 +2452,7 @@ def translate_sql_qmark_to_percent(sql: str) -> str: :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar` objects, so that the visual appearance matches what they expect from their database. - """ # noqa + """ # noqa: E501 # 1. Escape % characters sql = escape_percent_for_python_dbapi(sql) # 2. Replace ? characters that are not within quotes with %s. @@ -2466,3 +2466,29 @@ def translate_sql_qmark_to_percent(sql: str) -> str: else: newsql += c return newsql + + +def decorate_index_name( + idxname: str, tablename: str = None, engine: Engine = None +) -> str: + """ + Amend the name of a database index. Specifically, this is because SQLite + (which we won't use much, but do use for testing!) won't accept two indexes + with the same names applying to different tables. + + Args: + idxname: + The original index name. + tablename: + The name of the table. + engine: + The SQLAlchemy engine, from which we obtain the dialect. + + Returns: + The index name, amended if necessary. 
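+
+    A sketch of the intended behaviour (the engine objects here are
+    hypothetical):
+
+    .. code-block:: python
+
+        decorate_index_name("_idx__set", "mytable", sqlite_engine)
+        # -> '_idx__set_mytable'  (SQLite: table name appended)
+        decorate_index_name("_idx__set", "mytable", mysql_engine)
+        # -> '_idx__set'  (other dialects: unchanged)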
+ """ + if not tablename or not engine: + return idxname + if engine.dialect.name == "sqlite": + return f"{idxname}_{tablename}" + return idxname diff --git a/crate_anon/crateweb/config/settings.py b/crate_anon/crateweb/config/settings.py index 8f189819..2df0909a 100644 --- a/crate_anon/crateweb/config/settings.py +++ b/crate_anon/crateweb/config/settings.py @@ -58,7 +58,7 @@ UrlNames, ) -# https://stackoverflow.com/questions/2636536/how-to-make-django-work-with-unsupported-mysql-drivers-such-as-gevent-mysql-or-c # noqa +# https://stackoverflow.com/questions/2636536/how-to-make-django-work-with-unsupported-mysql-drivers-such-as-gevent-mysql-or-c # noqa: E501 try: import pymysql @@ -115,10 +115,10 @@ # 'kombu.transport.django', # for Celery with Django database as broker # 'template_profiler_panel', # 'silk', - "crate_anon.crateweb.config.apps.UserProfileAppConfig", # for user-specific settings # noqa - "crate_anon.crateweb.config.apps.ResearchAppConfig", # the research database query app # noqa - "crate_anon.crateweb.config.apps.ConsentAppConfig", # the consent-to-contact app # noqa - "crate_anon.crateweb.config.apps.CoreAppConfig", # for e.g. the runcpserver command # noqa + "crate_anon.crateweb.config.apps.UserProfileAppConfig", # for user-specific settings # noqa: E501 + "crate_anon.crateweb.config.apps.ResearchAppConfig", # the research database query app # noqa: E501 + "crate_anon.crateweb.config.apps.ConsentAppConfig", # the consent-to-contact app # noqa: E501 + "crate_anon.crateweb.config.apps.CoreAppConfig", # for e.g. the runcpserver command # noqa: E501 "crate_anon.crateweb.config.apps.ApiConfig", # for the anonymisation API ) @@ -130,7 +130,7 @@ # ... reinstated here 2017-01-30 (django-debug-toolbar==1.6) # ... "as early as possible... but after any other middle that encodes the # response's content, such as GZipMiddleware" - # ... http://django-debug-toolbar.readthedocs.io/en/1.0/installation.html#explicit-setup # noqa + # ... 
http://django-debug-toolbar.readthedocs.io/en/1.0/installation.html#explicit-setup # noqa: E501 "django.contrib.sessions.middleware.SessionMiddleware", "django.middleware.common.CommonMiddleware", "django.middleware.csrf.CsrfViewMiddleware", @@ -140,11 +140,11 @@ "django.middleware.clickjacking.XFrameOptionsMiddleware", "django.middleware.security.SecurityMiddleware", # Additional: - "cardinal_pythonlib.django.middleware.UserBasedExceptionMiddleware", # provide debugging details to superusers # noqa - "cardinal_pythonlib.django.middleware.LoginRequiredMiddleware", # prohibit all pages except login pages if not logged in # noqa - # 'cardinal_pythonlib.django.middleware.DisableClientSideCachingMiddleware', # no client-side caching # noqa - "crate_anon.crateweb.core.middleware.RestrictAdminMiddleware", # non-developers can't access the devadmin site # noqa - # 'cardinal_pythonlib.django.request_cache.RequestCacheMiddleware', # per-request cache, UNTESTED # noqa + "cardinal_pythonlib.django.middleware.UserBasedExceptionMiddleware", # provide debugging details to superusers # noqa: E501 + "cardinal_pythonlib.django.middleware.LoginRequiredMiddleware", # prohibit all pages except login pages if not logged in # noqa: E501 + # 'cardinal_pythonlib.django.middleware.DisableClientSideCachingMiddleware', # no client-side caching # noqa: E501 + "crate_anon.crateweb.core.middleware.RestrictAdminMiddleware", # non-developers can't access the devadmin site # noqa: E501 + # 'cardinal_pythonlib.django.request_cache.RequestCacheMiddleware', # per-request cache, UNTESTED # noqa: E501 ) LOGIN_URL = "/login/" # for LoginRequiredMiddleware @@ -166,7 +166,7 @@ "django.template.context_processors.request", "django.contrib.auth.context_processors.auth", "django.contrib.messages.context_processors.messages", - "crate_anon.crateweb.core.context_processors.common_context", # noqa + "crate_anon.crateweb.core.context_processors.common_context", ], "loaders": [ # https://docs.djangoproject.com/en/1.9/ref/templates/api/ @@ -206,9 +206,9 @@ "debug_toolbar.panels.logging.LoggingPanel", "debug_toolbar.panels.redirects.RedirectsPanel", # Built in but not enabled as standard: - # 'debug_toolbar.panels.profiling.ProfilingPanel', # EXTREME DANGER! Breaks middleware inc. LoginRequiredMiddleware! # noqa + # 'debug_toolbar.panels.profiling.ProfilingPanel', # EXTREME DANGER! Breaks middleware inc. LoginRequiredMiddleware! # noqa: E501 # Extra: - # 'template_profiler_panel.panels.template.TemplateProfilerPanel', # removed 2017-01-31; division by zero error # noqa + # 'template_profiler_panel.panels.template.TemplateProfilerPanel', # removed 2017-01-31; division by zero error # noqa: E501 ] @@ -223,7 +223,7 @@ CELERY_TASK_SERIALIZER = "json" # Results are OPTIONAL. The CRATE web service doesn't use them. # But may be helpful for Celery testing. 
-# See http://docs.celeryproject.org/en/latest/configuration.html#std:setting-CELERY_RESULT_BACKEND # noqa +# See http://docs.celeryproject.org/en/latest/configuration.html#std:setting-CELERY_RESULT_BACKEND # noqa: E501 CELERY_RESULT_BACKEND = "rpc://" # uses AMQP CELERY_RESULT_PERSISTENT = False diff --git a/crate_anon/crateweb/consent/lookup_rio.py b/crate_anon/crateweb/consent/lookup_rio.py index eb816871..de26d95c 100644 --- a/crate_anon/crateweb/consent/lookup_rio.py +++ b/crate_anon/crateweb/consent/lookup_rio.py @@ -977,7 +977,7 @@ def get_latest_consent_mode_from_rio_generic( po.NHSNumber = %s -- string comparison ORDER BY cr.AssessmentDate DESC - """ # noqa + """ # noqa: E501 # BEWARE "%s" IN SQL COMMENTS! The database backend will crash because # the number of substituted parameters will be wrong. # New as of 2018-06-28: @@ -999,7 +999,7 @@ def get_latest_consent_mode_from_rio_generic( cr.NHSNumber = %s -- string comparison ORDER BY cr.AssessmentDate DESC - """ # noqa + """ # noqa: E501 else: assert False, "Internal bug" # makes type checker happy diff --git a/crate_anon/crateweb/consent/tasks.py b/crate_anon/crateweb/consent/tasks.py index 6e17203e..8aaa5f9a 100644 --- a/crate_anon/crateweb/consent/tasks.py +++ b/crate_anon/crateweb/consent/tasks.py @@ -97,7 +97,7 @@ Requires Django 1.9. As of 2015-11-21, that means 1.9rc1 -""" # noqa +""" # noqa: E501 import logging from celery import shared_task @@ -195,7 +195,7 @@ def finalize_clinician_response(clinician_response_id: int) -> None: Args: clinician_response_id: PK of the clinician response - """ # noqa + """ # noqa: E501 from crate_anon.crateweb.consent.models import ( ClinicianResponse, ) # delayed import diff --git a/crate_anon/crateweb/core/management/commands/runcpserver.py b/crate_anon/crateweb/core/management/commands/runcpserver.py index ba620c3e..89002089 100755 --- a/crate_anon/crateweb/core/management/commands/runcpserver.py +++ b/crate_anon/crateweb/core/management/commands/runcpserver.py @@ -42,7 +42,7 @@ ./manage.py runcpserver --port 8080 --ssl_certificate /etc/ssl/certs/ssl-cert-snakeoil.pem --ssl_private_key /etc/ssl/private/ssl-cert-snakeoil.key -""" # noqa +""" # noqa: E501 from argparse import ArgumentParser, Namespace import logging diff --git a/crate_anon/crateweb/core/utils.py b/crate_anon/crateweb/core/utils.py index bb146bc1..3d92e177 100644 --- a/crate_anon/crateweb/core/utils.py +++ b/crate_anon/crateweb/core/utils.py @@ -237,7 +237,7 @@ def site_absolute_url(path: str) -> str: But that does at least mean we can use the same method for static and Django URLs. 
- """ # noqa + """ # noqa: E501 url = settings.DJANGO_SITE_ROOT_ABSOLUTE_URL + path log.debug(f"site_absolute_url: {path} -> {url}") return url @@ -467,7 +467,7 @@ class JavascriptTree(JavascriptTreeNode): print(t.js_str_html()) print(t.js_data()) - """ # noqa + """ # noqa: E501 def __init__( self, diff --git a/crate_anon/crateweb/research/html_functions.py b/crate_anon/crateweb/research/html_functions.py index a6a80bce..4199332a 100644 --- a/crate_anon/crateweb/research/html_functions.py +++ b/crate_anon/crateweb/research/html_functions.py @@ -93,7 +93,7 @@ def visibility_button( {title_html} - """ # noqa + """ # noqa: E501 def visibility_contentdiv( diff --git a/crate_anon/crateweb/research/migrations/0007_sitewidequery.py b/crate_anon/crateweb/research/migrations/0007_sitewidequery.py index 04d9f377..b407d921 100644 --- a/crate_anon/crateweb/research/migrations/0007_sitewidequery.py +++ b/crate_anon/crateweb/research/migrations/0007_sitewidequery.py @@ -53,20 +53,20 @@ class Migration(migrations.Migration): "64-bit non-cryptographic hash of SQL query" ) ), - ), # nopep8 + ), ( "args", JsonClassField( null=True, verbose_name="SQL arguments (as JSON)" ), - ), # nopep8 + ), ( "raw", models.BooleanField( default=False, verbose_name="SQL is raw, not parameter-substituted", ), - ), # nopep8 + ), ( "qmark", models.BooleanField( @@ -76,7 +76,7 @@ class Migration(migrations.Migration): " placeholders" ), ), - ), # nopep8 + ), ("created", models.DateTimeField(auto_now_add=True)), ( "deleted", @@ -87,13 +87,13 @@ class Migration(migrations.Migration): " queries are never properly deleted." ), ), - ), # nopep8 + ), ( "description", models.TextField( default="", verbose_name="query description" ), - ), # nopep8 + ), ], options={ "abstract": False, diff --git a/crate_anon/crateweb/research/migrations/0008_query_display.py b/crate_anon/crateweb/research/migrations/0008_query_display.py index 2de287e4..64faf09c 100644 --- a/crate_anon/crateweb/research/migrations/0008_query_display.py +++ b/crate_anon/crateweb/research/migrations/0008_query_display.py @@ -46,6 +46,6 @@ class Migration(migrations.Migration): field=models.TextField( default="[]", verbose_name="Subset of output columns to be displayed", - ), # nopep8 + ), ), ] diff --git a/crate_anon/crateweb/research/migrations/0009_query_no_null.py b/crate_anon/crateweb/research/migrations/0009_query_no_null.py index b48487d3..3eab05ec 100644 --- a/crate_anon/crateweb/research/migrations/0009_query_no_null.py +++ b/crate_anon/crateweb/research/migrations/0009_query_no_null.py @@ -16,6 +16,6 @@ class Migration(migrations.Migration): field=models.BooleanField( default=False, verbose_name="Omit Null columns for this query when displayed", - ), # nopep8 + ), ), ] diff --git a/crate_anon/crateweb/research/models.py b/crate_anon/crateweb/research/models.py index fbb76d18..95187f47 100644 --- a/crate_anon/crateweb/research/models.py +++ b/crate_anon/crateweb/research/models.py @@ -171,7 +171,7 @@ def hack_django_pyodbc_azure_cursorwrapper() -> None: SettingsKeys.DISABLE_DJANGO_PYODBC_AZURE_CURSOR_FETCHONE_NEXTSET, True, ): - # https://stackoverflow.com/questions/5601590/how-to-define-a-default-value-for-a-custom-django-setting # noqa + # https://stackoverflow.com/questions/5601590/how-to-define-a-default-value-for-a-custom-django-setting # noqa: E501 hack_django_pyodbc_azure_cursorwrapper() @@ -278,7 +278,7 @@ def get_executed_researchdb_cursor( from crate_anon.crateweb.research.models import * c = get_executed_researchdb_cursor("SELECT 1") - """ # noqa + """ 
# noqa: E501 args = args or [] cursor = connections[ RESEARCH_DB_CONNECTION_NAME @@ -372,7 +372,7 @@ def gen_excel_row_elements( converting to ``str`` in the encoding of the worksheet's choice.) """ # Docstring must be a raw string for Sphinx! See - # http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html#text-syntax-bold-italic-verbatim-and-special-characters # noqa + # http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html#text-syntax-bold-italic-verbatim-and-special-characters # noqa: E501 for element in row: if isinstance(element, bytes): # Convert to str using the worksheet's encoding. @@ -621,7 +621,7 @@ def save(self, *args, update_fields=None, **kwargs) -> None: Custom save method. Ensures that only one :class:`Query` has ``active == True`` for a given user. Also sets the hash. """ - # https://stackoverflow.com/questions/1455126/unique-booleanfield-value-in-django # noqa + # https://stackoverflow.com/questions/1455126/unique-booleanfield-value-in-django # noqa: E501 if self.active: Query.objects.filter(user=self.user, active=True).update( active=False @@ -1252,7 +1252,7 @@ def __str__(self) -> str: # # https://docs.djangoproject.com/en/1.8/topics/db/multi-db/ # # https://newcircle.com/s/post/1242/django_multiple_database_support # # noinspection PyMethodMayBeStatic,PyUnusedLocal -# def db_for_read(self, model: Type[models.Model], **hints) -> Optional[str]: # noqa +# def db_for_read(self, model: Type[models.Model], **hints) -> Optional[str]: # noqa: E501 # """ # read model PidLookup -> look at database secret # """ @@ -1269,7 +1269,7 @@ def __str__(self) -> str: # # 2017-02-12, to address bug: # # - https://code.djangoproject.com/ticket/27054 # # See also: -# # - https://docs.djangoproject.com/en/1.10/topics/db/multi-db/#using-other-management-commands # noqa +# # - https://docs.djangoproject.com/en/1.10/topics/db/multi-db/#using-other-management-commands # noqa: E501 # return db == 'default' @@ -1311,7 +1311,7 @@ class Meta: managed = False db_table = PatientInfoConstants.SECRET_MAP_TABLENAME - # https://stackoverflow.com/questions/12158463/how-can-i-make-a-model-read-only # noqa + # https://stackoverflow.com/questions/12158463/how-can-i-make-a-model-read-only # noqa: E501 def save(self, *args, **kwargs) -> None: return @@ -1535,7 +1535,7 @@ class PatientMultiQuery: - ... 
within - """ # noqa + """ # noqa: E501 def __init__( self, @@ -1690,7 +1690,7 @@ def set_override_query(self, query: str) -> None: AND anonymous_output.patient.nhshash IS NOT NULL ORDER BY _mrid - """ # noqa + """ # noqa: E501 self._manual_patient_id_query = query def _get_select_mrid_column(self) -> Optional[ColumnId]: diff --git a/crate_anon/crateweb/research/research_db_info.py b/crate_anon/crateweb/research/research_db_info.py index 1455e952..849ddd57 100644 --- a/crate_anon/crateweb/research/research_db_info.py +++ b/crate_anon/crateweb/research/research_db_info.py @@ -554,7 +554,7 @@ def _schema_query_microsoft( - ``sys.fulltext_catalogs``: https://msdn.microsoft.com/en-us/library/ms188779.aspx - ``sys.fulltext_index_columns``: https://msdn.microsoft.com/en-us/library/ms188335.aspx - """ # noqa + """ # noqa: E501 if not schema_names: raise ValueError( "No schema_names specified (for SQL Server " "database)" diff --git a/crate_anon/crateweb/research/views.py b/crate_anon/crateweb/research/views.py index 937aed34..0035d401 100644 --- a/crate_anon/crateweb/research/views.py +++ b/crate_anon/crateweb/research/views.py @@ -156,6 +156,16 @@ get_patients_per_page, UserProfile, ) +from crate_anon.nlp_manager.constants import ( + # Fieldnames for CRATE NLP table + FN_NLPDEF, + FN_SRCDB, + FN_SRCFIELD, + FN_SRCPKFIELD, + FN_SRCPKSTR, + FN_SRCPKVAL, + FN_SRCTABLE, +) log = BraceStyleAdapter(logging.getLogger(__name__)) @@ -173,16 +183,6 @@ MPID_PREFIX = "~mpid" -# Fieldnames for CRATE NLP table -FN_NLPDEF = "_nlpdef" -FN_SRCDB = "_srcdb" -FN_SRCTABLE = "_srctable" -FN_SRCFIELD = "_srcfield" -FN_SRCPKFIELD = "_srcpkfield" -FN_SRCPKVAL = "_srcpkval" -FN_SRCPKSTR = "_srcpkstr" - - # ============================================================================= # Helper functions # ============================================================================= @@ -1596,7 +1596,7 @@ def resultset_html_table( Returns: str: HTML - """ # noqa + """ # noqa: E501 # Considered but not implemented: hiding table columns # ... see esp "tr > *:nth-child(n)" at # https://stackoverflow.com/questions/5440657/how-to-hide-columns-in-html-table # noqa @@ -1729,7 +1729,7 @@ def single_record_html_table( Returns: str: HTML - """ # noqa + """ # noqa: E501 table_html = "" if FN_NLPDEF in fieldnames: srcdb_ind = srctable_ind = srcfield_ind = None @@ -2410,7 +2410,7 @@ def render_lookup( Returns: a :class:`django.http.response.HttpResponse` - """ # noqa + """ # noqa: E501 # if not request.user.superuser: # return HttpResponse('Forbidden', status=403) # # https://stackoverflow.com/questions/3297048/403-forbidden-vs-401-unauthorized-http-responses # noqa @@ -2896,7 +2896,7 @@ def common_find_text( Returns: a :class:`django.http.response.HttpResponse` - """ # noqa + """ # noqa: E501 # When you forget about Django forms, go back to: # http://www.slideshare.net/pydanny/advanced-django-forms-usage diff --git a/crate_anon/linkage/comparison.py b/crate_anon/linkage/comparison.py index 26f44d09..c321ba40 100644 --- a/crate_anon/linkage/comparison.py +++ b/crate_anon/linkage/comparison.py @@ -272,7 +272,7 @@ def __init__( :math:`P(D | \neg H) = P(\text{match given different person}) = p_f`. If no match: :math:`P(D | \neg H) = 1 - P(\text{match given different person}) = 1 - p_f`. 
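
        A worked example (illustrative only): if
        :math:`P(\text{match given same person}) = 0.95` and
        :math:`p_f = 0.05`, observing a match multiplies the running
        likelihood ratio by :math:`0.95 / 0.05 = 19`, whereas observing no
        match multiplies it by :math:`(1 - 0.95) / (1 - 0.05) \approx 0.053`.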
- """ # noqa + """ # noqa: E501 super().__init__() self.match = match self.p_match_given_same_person = p_match_given_same_person diff --git a/crate_anon/linkage/constants.py b/crate_anon/linkage/constants.py index 2f084388..ff44d358 100644 --- a/crate_anon/linkage/constants.py +++ b/crate_anon/linkage/constants.py @@ -70,7 +70,7 @@ def _mk_dictstr(x: Dict[str, float]) -> str: THIS_DIR = os.path.abspath(os.path.dirname(__file__)) UK_MEAN_OA_POPULATION_2011 = 309 # not used any more! Left here for interest. -# ... https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography # noqa +# ... https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography # noqa: E501 UK_POPULATION_2017 = 66040000 # 2017 figure, 66.04m CAMBS_PBORO_POPULATION_2018 = 852523 diff --git a/crate_anon/linkage/helpers.py b/crate_anon/linkage/helpers.py index 42903e05..2e15e57e 100644 --- a/crate_anon/linkage/helpers.py +++ b/crate_anon/linkage/helpers.py @@ -143,7 +143,7 @@ - this allows you to look up *from* output area *to* postcode sector, implying that postcode sectors must be larger. -""" # noqa +""" # noqa: E501 # ============================================================================= @@ -179,7 +179,7 @@ >>> dmeta("Clérambault") # raises UnicodeEncodeError -""" # noqa +""" # noqa: E501 # ============================================================================= @@ -501,7 +501,7 @@ def get_postcode_sector( # (*) [2] uses "Z99 3CZ" (page 6); [1, 3] use "ZZ99 3CZ". )) PSEUDO_POSTCODE_SECTORS = set(get_postcode_sector(p) for p in PSEUDO_POSTCODES) -""" # noqa +""" # noqa: E501 PSEUDO_POSTCODE_START = "ZZ99" PSEUDOPOSTCODE_NFA = "ZZ993VZ" # no fixed abode @@ -806,7 +806,7 @@ def mk_blurry_dates(d: Union[Date, str]) -> Tuple[str, str, str]: Returns MONTH_DAY, YEAR_DAY, and YEAR_MONTH versions in a standard form. """ # ISO format is %Y-%m-%d; see - # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes # noqa + # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes # noqa: E501 # Here we want the shortest full representation; these are not intended to # be human-legible. d = coerce_to_pendulum_date(d) diff --git a/crate_anon/linkage/matchconfig.py b/crate_anon/linkage/matchconfig.py index a68f761a..62d6c2d9 100644 --- a/crate_anon/linkage/matchconfig.py +++ b/crate_anon/linkage/matchconfig.py @@ -678,7 +678,7 @@ def mk_p_c_dict( simplify(second_stage_speedup) # = b + 647 / 16 - """ # noqa + """ # noqa: E501 if verbose: log.debug(f"... MatchConfig built. Settings: {self}") diff --git a/crate_anon/linkage/people.py b/crate_anon/linkage/people.py index 3ca8f4cb..2cd9a509 100644 --- a/crate_anon/linkage/people.py +++ b/crate_anon/linkage/people.py @@ -287,7 +287,7 @@ def get_unique_match_detailed(self, proband: Person) -> MatchResult: # 2020-04-25: Do this in one pass. # A bit like - # https://www.geeksforgeeks.org/python-program-to-find-second-largest-number-in-a-list/ # noqa + # https://www.geeksforgeeks.org/python-program-to-find-second-largest-number-in-a-list/ # noqa: E501 # ... but modified, as that fails to deal with joint winners # ... and it's not a super algorithm anyway. diff --git a/crate_anon/linkage/validation/entropy_names.py b/crate_anon/linkage/validation/entropy_names.py index 11a979f8..3e21046a 100755 --- a/crate_anon/linkage/validation/entropy_names.py +++ b/crate_anon/linkage/validation/entropy_names.py @@ -126,7 +126,7 @@ .. 
code-block:: none inf(A | B) = -log2[P(A | B)] [8], from [5] - + = -log2{ P(A) * pev(A, B) } from [4a] = -{ log2[P(A)] + log2[pev(A, B)] } = -log2[P(A)] - log2[pev(A, B)] @@ -219,15 +219,15 @@ the other. The "iev" concept above is about pairs of individual events. For two discrete RVs, - - I(X; Y) = sum_y{ sum_x{ P_XY(x, y) log[ P_XY(x, y) / (P_X(x) * P_Y(y)) ] }} + + I(X; Y) = sum_y{ sum_x{ P_XY(x, y) log[ P_XY(x, y) / (P_X(x) * P_Y(y)) ] }} - Mutual information is a consideration across events. The individual-event - version is "pointwise mutual information", + version is "pointwise mutual information", https://en.wikipedia.org/wiki/Pointwise_mutual_information, which is - + .. code-block:: none - + pmi(x; y) = log[ P(x, y) / (P(x) * P(y) ] = log[ P(x | y) / P(x) ] = log[ P(y | x) / P(y) ] @@ -251,7 +251,7 @@ .. code-block:: none - ln(x) = log2(x) * ln(2) + ln(x) = log2(x) * ln(2) log2(x) = ln(x) * log2(e) A partial match would provide a log likelihood of @@ -279,7 +279,7 @@ Code largely abandoned; not re-checked since NameFrequencyInfo was refactored, since this code had served it purpose. -""" # noqa +""" # noqa: E501 # ============================================================================= @@ -986,7 +986,7 @@ def show_partial_match_frequencies() -> None: P(share metaphone, not first two char or name) = 0.0014685292425002856 P(share first two char, not metaphone or name) = 0.014894287604562073 -""" # noqa +""" # noqa: E501 # ============================================================================= diff --git a/crate_anon/linkage/validation/validate_fuzzy_linkage.py b/crate_anon/linkage/validation/validate_fuzzy_linkage.py index adc4937e..45c36e6e 100755 --- a/crate_anon/linkage/validation/validate_fuzzy_linkage.py +++ b/crate_anon/linkage/validation/validate_fuzzy_linkage.py @@ -1090,7 +1090,7 @@ def validate_2_fetch_cdl( -- Final count: 152888 (on 2022-05-26). -- Compare: SELECT COUNT(*) FROM rawCRSCDL.dbo.[CRS_Output_2020 09 21] = 162874 - """ # noqa + """ # noqa: E501 ) engine = create_engine(url, echo=echo) result = engine.execute(sql) # type: CursorResult @@ -1344,7 +1344,7 @@ def validate_2_fetch_pcmis( -- Final count: 93347 (on 2022-05-26). -- Compare: SELECT COUNT(*) FROM rawPCMIS.dbo.PatientDetails = 94344. - """ # noqa + """ # noqa: E501 ) engine = create_engine(url, echo=echo) result = engine.execute(sql) # type: CursorResult @@ -1775,7 +1775,7 @@ def validate_2_fetch_rio( -- 2022-06-16) after removal of NNNStatus clause. -- Compare: SELECT COUNT(*) FROM RiO62CAMLive.dbo.ClientIndex = 216739 -- Compare: SELECT COUNT(*) FROM RiO62CAMLive.dbo.Client = 216739 - """ # noqa + """ # noqa: E501 ) engine = create_engine(url, echo=echo) result = engine.execute(sql) # type: CursorResult @@ -2072,7 +2072,7 @@ def validate_2_fetch_systmone( -- Final count: 613175 (2022-06-01). -- Compare: SELECT COUNT(*) FROM SystmOne.dbo.S1_Patient = 619062. 
- """ # noqa + """ # noqa: E501 ) engine = create_engine(url, echo=echo) result = engine.execute(sql) # type: CursorResult @@ -2385,7 +2385,7 @@ def help_v2_compare(plaintext: bool) -> str: set {EnvVar.COMMON_OPTIONS}=--{Switches.POPULATION_SIZE} {CAMBS_POPULATION} --{Switches.EXTRA_VALIDATION_OUTPUT} cd "%{EnvVar.DATADIR}%" {help_v2_compare(plaintext=False)} -""" # noqa +""" # noqa: E501 # Skipped: {help_v2_compare(plaintext=True)} diff --git a/crate_anon/nlp_manager/all_processors.py b/crate_anon/nlp_manager/all_processors.py index f202e3f9..0c9f9863 100644 --- a/crate_anon/nlp_manager/all_processors.py +++ b/crate_anon/nlp_manager/all_processors.py @@ -67,6 +67,7 @@ from crate_anon.nlp_manager.parse_substance_misuse import ( ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS, ) +from crate_anon.nlp_webserver.server_processor import ServerProcessor ClassType = Type[object] @@ -119,12 +120,16 @@ def all_local_parser_classes() -> List[Type[BaseNlpParser]]: Return all classes that are non-abstract subclasses of :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser`. + ... but not test parsers. + Checks that they all have unique names in lower case. """ # noinspection PyTypeChecker classes = get_all_subclasses( BaseNlpParser ) # type: List[Type[BaseNlpParser]] + classes = [cls for cls in classes if not cls.is_test_nlp_parser] + lower_case_short_names = set() # type: Set[str] lower_case_full_names = set() # type: Set[str] for cls in classes: @@ -209,40 +214,36 @@ def make_nlp_parser( raise ValueError(f"Unknown NLP processor type: {classname!r}") -def make_nlp_parser_unconfigured( - classname: str, raise_if_absent: bool = True -) -> Optional[TableMaker]: +def possible_local_processor_names() -> List[str]: """ - Get a debugging (unconfigured) instance of an NLP parser. - - Args: - classname: the name of the NLP parser class - raise_if_absent: raise ``ValueError`` if there is no match? - - Returns: - the class, or ``None`` if there isn't one with that name - + Returns all NLP processor names that can run locally. """ - cls = get_nlp_parser_class(classname) - if cls: - return cls(nlpdef=None, cfg_processor_name=None) - if raise_if_absent: - raise ValueError(f"Unknown NLP processor type: {classname!r}") - return None + return [cls.classname() for cls in all_local_parser_classes()] -def possible_local_processor_names() -> List[str]: +def all_nlp_processor_classes() -> List[Type[TableMaker]]: """ - Returns all NLP processor names that can run locally. + Returns all NLP processor classes. """ - return [cls.classname() for cls in all_local_parser_classes()] + return all_tablemaker_classes() def possible_processor_names_including_cloud() -> List[str]: """ Returns all NLP processor names. """ - return [cls.classname() for cls in all_tablemaker_classes()] + return [cls.classname() for cls in all_nlp_processor_classes()] + + +def all_local_processor_classes_without_external_tools() -> ( + List[Type[BaseNlpParser]] +): + """ + Returns all NLP processor classes that don't rely on external tools. 
+ """ + return [ + cls for cls in all_local_parser_classes() if not cls.uses_external_tool + ] def possible_local_processor_names_without_external_tools() -> List[str]: @@ -252,8 +253,7 @@ def possible_local_processor_names_without_external_tools() -> List[str]: """ return [ cls.classname() - for cls in all_local_parser_classes() - if not cls.uses_external_tool + for cls in all_local_processor_classes_without_external_tools() ] @@ -287,10 +287,33 @@ def all_crate_python_processors_nlprp_processor_info( list: list of processor information dictionaries """ allprocs = [] # type: JsonArrayType - for cls in all_local_parser_classes(): + for cls in all_local_processor_classes_without_external_tools(): instance = cls(None, None) proc_info = instance.nlprp_processor_info(sql_dialect=sql_dialect) if extra_dict: proc_info.update(extra_dict) allprocs.append(proc_info) return allprocs + + +def register_all_crate_python_processors_with_serverprocessor( + set_parser: bool = True, +) -> None: + """ + Somewhat ugly. Register all CRATE Python NLP processors with the + ServerProcessor class. + + See also crate_anon/nlp_webserver/procs.py, for a similar thing from JSON. + + Args: + set_parser: + Set up a "free-floating" parser too? + """ + for cls in all_local_processor_classes_without_external_tools(): + instance = cls(None, None) + _proc = instance.nlprp_processor_info() + _x = ServerProcessor.from_nlprp_json_dict(_proc) + # ... registers with the ServerProcessor class + # Doing this here saves time per request + if set_parser: + _x.set_parser() diff --git a/crate_anon/nlp_manager/base_nlp_parser.py b/crate_anon/nlp_manager/base_nlp_parser.py index 6c07208e..a281558a 100644 --- a/crate_anon/nlp_manager/base_nlp_parser.py +++ b/crate_anon/nlp_manager/base_nlp_parser.py @@ -67,6 +67,7 @@ COMMENT, TABLE_KWARGS, ) +from crate_anon.common.sql import decorate_index_name from crate_anon.common.stringfunc import ( compress_docstring, does_text_contain_word_chars, @@ -79,7 +80,7 @@ full_sectionname, NlpConfigPrefixes, ProcessorConfigKeys, - GateFieldNames as GateFN, + GateFieldNames, SqlTypeDbIdentifier, MAX_SQL_FIELD_LEN, ) @@ -198,7 +199,7 @@ def fully_qualified_classname(cls) -> str: Returns the class's fully qualified name. """ # This may be imperfect; see - # https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python # noqa + # https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python # noqa: E501 # https://www.python.org/dev/peps/pep-3155/ return ".".join([cls.__module__, cls.__qualname__]) @@ -227,6 +228,9 @@ def dest_tables_indexes(self) -> Dict[str, List[Index]]: Describes indexes that this NLP processor suggests for its destination table(s). + It is perfectly legitimate for the list not to include some tables, or + indeed to be empty. + Returns: dict: a dictionary of ``{tablename: indexes}``, where ``indexes`` is a list of SQLAlchemy :class:`Index` objects. 
@@ -400,39 +404,50 @@ def _standard_gate_columns() -> List[Column]: """ return [ Column( - GateFN.SET, SqlTypeDbIdentifier, comment="GATE output set name" + GateFieldNames.SET, + SqlTypeDbIdentifier, + comment="GATE output set name", ), Column( - GateFN.TYPE, + GateFieldNames.TYPE, SqlTypeDbIdentifier, comment="GATE annotation type name", ), Column( - GateFN.ID, + GateFieldNames.ID, Integer, comment="GATE annotation ID (not clear this is very useful)", ), Column( - GateFN.STARTPOS, + GateFieldNames.STARTPOS, Integer, comment="Start position in the content", ), Column( - GateFN.ENDPOS, Integer, comment="End position in the content" + GateFieldNames.ENDPOS, + Integer, + comment="End position in the content", ), Column( - GateFN.CONTENT, + GateFieldNames.CONTENT, Text, comment="Full content marked as relevant.", ), ] - @staticmethod - def _standard_gate_indexes() -> List[Index]: + def _standard_gate_indexes(self, dest_tablename: str) -> List[Index]: """ Returns standard indexes for GATE output. """ - return [Index("_idx__set", GateFN.SET, mysql_length=MAX_SQL_FIELD_LEN)] + return [ + Index( + decorate_index_name( + "_idx__set", dest_tablename, self.dest_engine + ), + GateFieldNames.SET, + mysql_length=MAX_SQL_FIELD_LEN, + ) + ] @lru_cache(maxsize=None) def tables(self) -> Dict[str, Table]: @@ -476,7 +491,9 @@ def tables(self) -> Dict[str, Table]: copyindexes_list = [i.get_copy_indexes() for i in ifconfigs] self._assert_index_lists_identical(copyindexes_list) copy_indexes = copyindexes_list[0] - core_indexes = InputFieldConfig.get_core_indexes_for_dest() + core_indexes = InputFieldConfig.get_core_indexes_for_dest( + tablename=tablename, engine=self._destdb.engine + ) column_like_things = ( copy_of_cols + core_indexes + extra_dest_indexes + copy_indexes @@ -514,9 +531,12 @@ def get_table(self, tablename: str) -> Table: try: return tables[tablename] except KeyError: + all_tablenames = list(tables.keys()) raise KeyError( - f"No destination table for this NLP processor " - f"named {tablename!r}" + f"For this NLP processor ({self._cfg_processor_name!r}), the " + f"destination table named {tablename!r} does not have an " + f"associated Table object. Known Table objects are " + f"named {all_tablenames}" ) def make_tables(self, drop_first: bool = False) -> List[str]: @@ -568,7 +588,7 @@ def delete_dest_record( If you don't do this, we will get deadlocks in incremental mode. See e.g. https://dev.mysql.com/doc/refman/5.5/en/innodb-deadlocks.html - """ # noqa + """ # noqa: E501 session = self.dest_session srcdb = ifconfig.srcdb srctable = ifconfig.srctable @@ -685,6 +705,7 @@ class BaseNlpParser(TableMaker): """ uses_external_tool = False # may be overridden + is_test_nlp_parser = False # may be overridden by tests! def __init__( self, @@ -855,7 +876,7 @@ def describe_sqla_col( # dialect = MSDialect() column_type = column.type.compile(dialect) data_type = column_type.partition("(")[0] - # ... https://stackoverflow.com/questions/27387415/how-would-i-get-everything-before-a-in-a-string-python # noqa + # ... https://stackoverflow.com/questions/27387415/how-would-i-get-everything-before-a-in-a-string-python # noqa: E501 return { NlprpKeys.COLUMN_NAME: column.name, NlprpKeys.COLUMN_TYPE: column_type, diff --git a/crate_anon/nlp_manager/build_medex_itself.py b/crate_anon/nlp_manager/build_medex_itself.py index ab4c226c..3460e1e1 100755 --- a/crate_anon/nlp_manager/build_medex_itself.py +++ b/crate_anon/nlp_manager/build_medex_itself.py @@ -591,7 +591,7 @@ def main() -> None: (certainly the &/&& one!) 
or else they wouldn't print a stack trace and chug on. - """ # noqa + """ # noqa: E501 for bf in bugfixes: filename = bf["filename"] diff --git a/crate_anon/nlp_manager/cloud_parser.py b/crate_anon/nlp_manager/cloud_parser.py index 6473e49c..ca677b96 100644 --- a/crate_anon/nlp_manager/cloud_parser.py +++ b/crate_anon/nlp_manager/cloud_parser.py @@ -25,12 +25,10 @@ Send text to a cloud-based NLPRP server for processing. -.. todo:: cloud_parser: handle new ``tabular_schema`` info from server - """ import logging -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Tuple, Type, Union from cardinal_pythonlib.lists import chunks from sqlalchemy.schema import Column, Index @@ -39,7 +37,7 @@ from crate_anon.nlp_manager.nlp_definition import NlpDefinition from crate_anon.nlp_manager.constants import ProcessorConfigKeys, NlpDefValues from crate_anon.nlp_manager.output_user_config import OutputUserConfig -from crate_anon.nlprp.constants import NlprpKeys as NKeys, NlprpValues +from crate_anon.nlprp.constants import NlprpKeys, NlprpValues from crate_anon.nlp_manager.base_nlp_parser import TableMaker from crate_anon.nlp_webserver.server_processor import ServerProcessor @@ -86,10 +84,9 @@ def __init__( self.sql_dialect = None self.schema = None # type: Optional[Dict[str, Any]] self.available_remotely = False # update later if available - # Output section - bit of repetition from the 'Gate' parser + # Output section self._outputtypemap = {} # type: Dict[str, OutputUserConfig] self._type_to_tablename = {} # type: Dict[str, str] - self.tablename = None if not nlpdef and not cfg_processor_name: # Debugging only @@ -107,24 +104,30 @@ def __init__( self.format = self._cfgsection.opt_str( ProcessorConfigKeys.PROCESSOR_FORMAT, required=True ) - # Output section - bit of repetition from the 'Gate' parser typepairs = self._cfgsection.opt_strlist( ProcessorConfigKeys.OUTPUTTYPEMAP, required=True, lower=False ) - # If typepairs is empty the following block won't execute for output_type, outputsection in chunks(typepairs, 2): output_type = output_type.lower() c = OutputUserConfig( - nlpdef.parser, outputsection, schema_required=False + config_parser=nlpdef.parser, + cfg_output_name=outputsection, + schema_required=False, ) self._outputtypemap[output_type] = c self._type_to_tablename[output_type] = c.dest_tablename - if output_type == '""': - self.tablename = c.dest_tablename + # Also, ensure the user doesn't specify desttable (would be + # confusing). + if self._cfgsection.opt_str(ProcessorConfigKeys.DESTTABLE): + raise ValueError( + f"For cloud processors, don't specify " + f"{ProcessorConfigKeys.DESTTABLE!r}; table information is " + f"in {ProcessorConfigKeys.OUTPUTTYPEMAP!r}" + ) @staticmethod - def get_coltype_parts(coltype_str: str) -> List[str]: + def get_coltype_parts(coltype_str: str) -> Tuple[str, Union[str, int]]: """ Get root column type and parameter, i.e. for VARCHAR(50) root column type is VARCHAR and parameter is 50. @@ -136,7 +139,7 @@ def get_coltype_parts(coltype_str: str) -> List[str]: else: try: col_str, parameter = parts - except ValueError: + except ValueError: # e.g. 
"too many values to unpack" log.error(f"Invalid column type in response: {coltype_str}") raise try: @@ -144,17 +147,24 @@ def get_coltype_parts(coltype_str: str) -> List[str]: parameter = int(parameter) except ValueError: pass - return [col_str, parameter] + return col_str, parameter @staticmethod - def str_to_coltype_general(coltype_str: str) -> Type[sqlatypes.TypeEngine]: + def data_type_str_to_coltype( + data_type_str: str, + ) -> Type[sqlatypes.TypeEngine]: """ - Get the sqlalchemy column type class which fits with the column type. + Get the SQLAlchemy column type class which fits with the data type + specified. Currently we IGNORE self.sql_dialect. """ - coltype = getattr(sqlatypes, coltype_str) + coltype = getattr(sqlatypes, data_type_str) # Check if 'coltype' is really an sqlalchemy column type if issubclass(coltype, sqlatypes.TypeEngine): return coltype + raise NotImplementedError( + f"Don't know the SQLAlchemy column type corresponding to " + f"data type: {data_type_str!r}" + ) def is_tabular(self) -> bool: """ @@ -163,27 +173,78 @@ def is_tabular(self) -> bool: """ return self.schema_type == NlprpValues.TABULAR + def get_tabular_schema_tablenames(self) -> List[str]: + """ + Returns the names of the tables in the tabular schema (or an empty list + if we do not have a tabular schema). + """ + if not self.is_tabular(): + return [] + return list(self.schema.keys()) + + def get_local_from_remote_tablename(self, remote_tablename: str) -> str: + """ + When the remote server specifies a table name, we need to map it to + a local database table name. + + Raises KeyError on failure. + """ + try: + return self.get_tablename_from_type(remote_tablename) + except KeyError: + raise KeyError( + "No local table name defined for remote table " + f"{remote_tablename!r}" + ) + + def get_first_local_tablename(self) -> str: + """ + Used in some circumstances when the remote processor doesn't specify + a table. + """ + assert len(self._type_to_tablename) > 0 + return self._type_to_tablename[0] + def get_tablename_from_type(self, output_type: str) -> str: - return self._type_to_tablename[output_type] + """ + For simple remote GATE processors, or cloud processors: for a given + annotation type (GATE) or remote table name (cloud), return the + destination table name. + + Enforces lower-case lookup. + + Will raise KeyError if this fails. + """ + return self._type_to_tablename[output_type.lower()] def get_otconf_from_type(self, output_type: str) -> OutputUserConfig: - return self._outputtypemap[output_type] + """ + For a GATE annotation type, or cloud remote table name, return the + corresponding OutputUserConfig. + + Enforces lower-case lookup. + + Will raise KeyError if this fails. + """ + return self._outputtypemap[output_type.lower()] def _standard_columns_if_gate(self) -> List[Column]: """ Returns standard columns for GATE output if ``self.format`` is GATE. + Returns an empty list otherwise. """ if self.format == NlpDefValues.FORMAT_GATE: return self._standard_gate_columns() else: return [] - def _standard_indexes_if_gate(self) -> List[Index]: + def _standard_indexes_if_gate(self, dest_tablename: str) -> List[Index]: """ Returns standard indexes for GATE output if ``self.format`` is GATE. + Returns an empty list otherwise. 
""" if self.format == NlpDefValues.FORMAT_GATE: - return self._standard_gate_indexes() + return self._standard_gate_indexes(dest_tablename) else: return [] @@ -198,17 +259,15 @@ def set_procinfo_if_correct( self, remote_processor: ServerProcessor ) -> None: """ - Checks if a processor dictionary, with all the nlprp specified info + Checks if a processor dictionary, with all the NLPLP-specified info a processor should have, belongs to this processor. If it does, then we add the information from the procesor dictionary. """ if self.procname != remote_processor.name: return - # if ((self.procversion is None and - # processor_dict[NKeys.IS_DEFAULT_VERSION]) or - if ( - self.procversion is None and remote_processor.is_default_version - ) or (self.procversion == remote_processor.version): + if (remote_processor.is_default_version and not self.procversion) or ( + self.procversion == remote_processor.version + ): self._set_processor_info(remote_processor) def _set_processor_info(self, remote_processor: ServerProcessor) -> None: @@ -222,122 +281,122 @@ def _set_processor_info(self, remote_processor: ServerProcessor) -> None: self.remote_processor_info = remote_processor # self.name = processor_dict[NKeys.NAME] self.schema_type = remote_processor.schema_type - if self.is_tabular(): + if remote_processor.is_tabular(): self.schema = remote_processor.tabular_schema self.sql_dialect = remote_processor.sql_dialect # Check that, by this stage, we either have a tabular schema from # the processor, or we have user-specified destfields - assert self.is_tabular or all( + assert self.is_tabular() or all( x.destfields for x in self._outputtypemap.values() ), ( "You haven't specified a table structure and the processor hasn't " "provided one." ) - def _str_to_coltype(self, data_type_str: str) -> sqlatypes.TypeEngine: - """ - This is supposed to get column types depending on the sql dialect - used by the server, but it's not implemented yet. + def dest_tables_columns(self) -> Dict[str, List[Column]]: """ - raise NotImplementedError - # if self.sql_dialect == SqlDialects.MSSQL: - # return self._str_to_coltype_mssql(data_type_str) - # elif self.sql_dialect == SqlDialects.MYSQL: - # return self._str_to_coltype_mysql(data_type_str) - # elif self.sql_dialect == SqlDialects.ORACLE: - # return self._str_to_coltype_oracle(data_type_str) - # elif self.sql_dialect == SqlDialects.POSTGRES: - # return self._str_to_coltype_postgres(data_type_str) - # elif self.sql_dialect == SqlDialects.SQLITE: - # return self._str_to_coltype_sqlite(data_type_str) - # else: - # pass + Describes the destination table(s) that this NLP processor wants to + write to. - def _dest_tables_columns_user(self) -> Dict[str, List[Column]]: - tables = {} # type: Dict[str, List[Column]] + Returns: + dict: a dictionary of ``{tablename: destination_columns}``, where + ``destination_columns`` is a list of SQLAlchemy :class:`Column` + objects. - for output_type, otconfig in self._outputtypemap.items(): - tables[ - otconfig.dest_tablename - ] = self._standard_columns_if_gate() + otconfig.get_columns( - self.dest_engine - ) - return tables + If there is an NLPRP remote table specification (tabular_schema + method), we start with that. - def _dest_tables_indexes_user(self) -> Dict[str, List[Index]]: - tables = {} # type: Dict[str, List[Index]] - for output_type, otconfig in self._outputtypemap.items(): - tables[otconfig.dest_tablename] = ( - self._standard_indexes_if_gate() + otconfig.indexes - ) - return tables + Then we add any user-defined tables. 
+        definition and a local definition, the local definition overrides the
+        remote definition. If the destination table info has no columns,
+        however, it is not used for table creation.
 
-    def _dest_tables_columns_auto(self) -> Dict[str, List[Column]]:
-        """
-        Gets the destination tables and their columns using the remote
-        processor information.
+        There may in principle be other tables in the local config that are
+        absent from the remote info (unusual!).
         """
-        tables = {}
-        for table, columns in self.schema.items():
-            column_objects = (
-                self._standard_columns_if_gate()
-            )  # type: List[Column]
-            if self.tablename:
-                tablename = self.tablename
-            else:
-                tablename = self.get_tablename_from_type(table)
-            # ... might be empty list
-            for column in columns:
-                col_str, parameter = self.get_coltype_parts(
-                    column[NKeys.COLUMN_TYPE]
-                )
-                data_type_str = column[NKeys.DATA_TYPE]
-                coltype = self.str_to_coltype_general(data_type_str)
-                column_objects.append(
-                    Column(
-                        column[NKeys.COLUMN_NAME],
-                        coltype if not parameter else coltype(parameter),
-                        comment=column[NKeys.COLUMN_COMMENT],
-                        nullable=column[NKeys.IS_NULLABLE],
-                    )
-                )
-            tables[tablename] = column_objects
-        return tables
+        table_columns = {}  # type: Dict[str, List[Column]]
+
+        # 1. NLPRP remote specification.
+        if self.is_tabular():
+            for remote_tablename, columndefs in self.schema.items():
+                # Build this table's columns from the remote (tabular schema)
+                # information. (Standard GATE columns, if any, are added via
+                # the user specification below.)
+                column_objects = []  # type: List[Column]
+                dest_tname = self.get_local_from_remote_tablename(
+                    remote_tablename
+                )
+                column_renames = self.get_otconf_from_type(
+                    remote_tablename
+                ).renames
+                for column_info in columndefs:
+                    colname = column_info[NlprpKeys.COLUMN_NAME]
+                    # Rename (or keep the same if no applicable rename):
+                    colname = column_renames.get(colname, colname)
+                    col_str, parameter = self.get_coltype_parts(
+                        column_info[NlprpKeys.COLUMN_TYPE]
+                    )
+                    data_type_str = column_info[NlprpKeys.DATA_TYPE]
+                    # We could use col_str or data_type_str here.
+                    coltype = self.data_type_str_to_coltype(data_type_str)
+                    column_objects.append(
+                        Column(
+                            name=colname,
+                            type_=coltype(parameter) if parameter else coltype,
+                            comment=column_info.get(NlprpKeys.COLUMN_COMMENT),
+                            nullable=column_info[NlprpKeys.IS_NULLABLE],
+                        )
+                    )
+                if not column_objects:
+                    raise ValueError(
+                        "Remote error: NLPRP server declares table "
+                        f"{remote_tablename!r} but provides no column "
+                        "information for it."
+                    )
+                table_columns[dest_tname] = column_objects
+
+        # 2. User specification.
+        for output_type, otconfig in self._outputtypemap.items():
+            if otconfig.destfields:
+                # The user has specified columns.
+                table_columns[
+                    otconfig.dest_tablename
+                ] = self._standard_columns_if_gate() + otconfig.get_columns(
+                    self.dest_engine
+                )
+            else:
+                # The user has noted the existence of the table, but hasn't
+                # specified columns.
+                if otconfig.dest_tablename not in table_columns:
+                    raise ValueError(
+                        f"Local table {otconfig.dest_tablename!r} has no "
+                        "remote definition, and no columns are defined for it "
+                        "in the config file either."
+                    )
+                # Otherwise: defined remotely, with no local detail; that's OK.
+ continue - def _dest_tables_indexes_auto(self) -> Dict[str, List[Index]]: - if self.format != NlpDefValues.FORMAT_GATE: - return {} # indexes can't be returned by the server - tables = {} # type: Dict[str, List[Index]] - for table in self.schema: - tables[table] = self._standard_gate_indexes() - return tables + # Done. + return table_columns def dest_tables_indexes(self) -> Dict[str, List[Index]]: - # Docstring in superclass - if self._outputtypemap and all( - x.destfields for x in self._outputtypemap.values() - ): - return self._dest_tables_indexes_user() - elif self.is_tabular(): - return self._dest_tables_indexes_auto() - else: - raise ValueError( - "You haven't specified a table structure and " - "the processor hasn't provided one." - ) + """ + Describes indexes that this NLP processor suggests for its destination + table(s). - def dest_tables_columns(self) -> Dict[str, List[Column]]: - # Docstring in superclass - if self._outputtypemap and all( - x.destfields for x in self._outputtypemap.values() - ): - return self._dest_tables_columns_user() - elif self.is_tabular(): - # Must have processor-defined schema because we already checked - # for it - return self._dest_tables_columns_auto() - else: - raise ValueError( - "You haven't specified a table structure and " - "the processor hasn't provided one." + Returns: + dict: a dictionary of ``{tablename: indexes}``, where ``indexes`` + is a list of SQLAlchemy :class:`Index` objects. + + The NLPRP remote table specification doesn't include indexing. So all + indexing information is from our config file, whether for GATE or + cloud processors. + """ + table_indexes = {} # type: Dict[str, List[Index]] + for output_type, otconfig in self._outputtypemap.items(): + dest_tablename = otconfig.dest_tablename + table_indexes[dest_tablename] = ( + self._standard_indexes_if_gate(dest_tablename) + + otconfig.indexes ) + return table_indexes diff --git a/crate_anon/nlp_manager/cloud_request.py b/crate_anon/nlp_manager/cloud_request.py index 3f75ac90..34928edc 100644 --- a/crate_anon/nlp_manager/cloud_request.py +++ b/crate_anon/nlp_manager/cloud_request.py @@ -55,18 +55,26 @@ from semantic_version import Version from urllib3.exceptions import NewConnectionError -from crate_anon.common.constants import JSON_INDENT, JSON_SEPARATORS_COMPACT +from crate_anon.common.constants import ( + JSON_INDENT, + JSON_SEPARATORS_COMPACT, + NoneType, +) from crate_anon.common.memsize import getsize from crate_anon.common.stringfunc import does_text_contain_word_chars from crate_anon.nlp_manager.cloud_parser import Cloud from crate_anon.nlp_manager.constants import ( FN_NLPDEF, + FN_SRCPKSTR, + FN_SRCPKVAL, FN_WHEN_FETCHED, - full_sectionname, + GateFieldNames, GateResultKeys, NlpConfigPrefixes, NlpDefValues, + full_sectionname, ) +from crate_anon.nlp_manager.models import FN_SRCHASH from crate_anon.nlp_manager.nlp_definition import NlpDefinition from crate_anon.nlprp.api import ( @@ -78,7 +86,7 @@ ) from crate_anon.nlprp.constants import ( NlprpCommands, - NlprpKeys as NKeys, + NlprpKeys, NlprpValues, NlprpVersions, ) @@ -121,12 +129,12 @@ def report_processor_errors(processor_data: Dict[str, Any]) -> None: Should only be called if there has been an error. Reports the error(s) to the log. 
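+
+    A minimal sketch of the expected ``processor_data`` (all values here are
+    hypothetical; the keys are the ones read below):
+
+    .. code-block:: none
+
+        {
+            "name": "some_processor",
+            "version": "0.1",
+            "success": false,
+            "errors": [
+                {
+                    "code": 500,
+                    "message": "Internal error",
+                    "description": "Something went wrong server-side"
+                }
+            ]
+        }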
""" - name = processor_data[NKeys.NAME] - version = processor_data[NKeys.VERSION] + name = processor_data[NlprpKeys.NAME] + version = processor_data[NlprpKeys.VERSION] error_messages = "\n".join( - f"{error[NKeys.CODE]} - {error[NKeys.MESSAGE]}: " - f"{error[NKeys.DESCRIPTION]}" - for error in processor_data[NKeys.ERRORS] + f"{error[NlprpKeys.CODE]} - {error[NlprpKeys.MESSAGE]}: " + f"{error[NlprpKeys.DESCRIPTION]}" + for error in processor_data[NlprpKeys.ERRORS] ) log.error( f"Processor {name!r} (version {version}) failed for this " @@ -134,6 +142,166 @@ def report_processor_errors(processor_data: Dict[str, Any]) -> None: ) +def extract_nlprp_top_level_results(nlp_data: JsonObjectType) -> List: + """ + Checks that the top-level NLP response contains an appropriate "results" + object, or raises KeyError or ValueError. + + Returns the list result, which is a list of results per document. + """ + try: + docresultlist = nlp_data[NlprpKeys.RESULTS] + except KeyError: + raise KeyError( + "Top-level response does not contain key " + f"{NlprpKeys.RESULTS!r}: {nlp_data!r}" + ) + if not isinstance(docresultlist, list): + raise ValueError( + f"{NlprpKeys.RESULTS!r} value is not a list: {docresultlist!r}" + ) + return docresultlist + + +def parse_nlprp_docresult_metadata( + docresult: JsonObjectType, +) -> Tuple[Dict[str, Any], Optional[int], Optional[str], str]: + """ + Check that this NLPRP document result validly contains metadata, and that + metadata contains things we always send. Extract key components. Provide + helpful error message on failure. + + Returns: + tuple (metadata, pkval, pkstr, srchhash) + + """ + try: + metadata = docresult[NlprpKeys.METADATA] + except KeyError: + raise KeyError( + "Document result does not contain key " + f"{NlprpKeys.METADATA!r}: {docresult!r}" + ) + if not isinstance(metadata, dict): + # ... expected type because that's what we sent; see add_text() + raise KeyError(f"Document result metadata is not a dict: {metadata!r}") + + try: + pkval = metadata[FN_SRCPKVAL] + except KeyError: + raise KeyError( + "Document metadata does not contain key " + f"{FN_SRCPKVAL!r}: {metadata!r}" + ) + if not isinstance(pkval, (int, NoneType)): + # ... expected type because that's what we sent; see add_text() + raise KeyError( + f"Document result metadata {FN_SRCPKVAL!r} is not null or int: " + f"{pkval!r}" + ) + + try: + pkstr = metadata[FN_SRCPKSTR] + except KeyError: + raise KeyError( + "Document metadata does not contain key " + f"{FN_SRCPKSTR!r}: {metadata!r}" + ) + if not isinstance(pkstr, (str, NoneType)): + raise KeyError( + f"Document result metadata {FN_SRCPKVAL!r} is not null or str: " + f"{pkstr!r}" + ) + + if pkval is None and pkstr is None: + raise ValueError( + f"In document result, both {FN_SRCPKVAL!r} and " + f"{FN_SRCPKSTR!r} are null" + ) + + try: + srchash = metadata[FN_SRCHASH] + except KeyError: + raise KeyError( + "Document metadata does not contain key " + f"{FN_SRCPKSTR!r}: {metadata!r}" + ) + if not isinstance(srchash, str): + raise KeyError( + f"Document result metadata {FN_SRCPKSTR!r} is not str: " + f"{srchash!r}" + ) + + return metadata, pkval, pkstr, srchash + + +def extract_processor_data_list( + docresult: JsonObjectType, +) -> List[JsonObjectType]: + """ + Check and extract a list of per-processor results from a single-document + NLPRP result. 
+ """ + try: + processor_data_list = docresult[NlprpKeys.PROCESSORS] + except KeyError: + raise KeyError( + "Document result does not contain key " + f"{NlprpKeys.PROCESSORS!r}: {docresult!r}" + ) + if not isinstance(processor_data_list, list): + raise ValueError( + f"Document result's {NlprpKeys.PROCESSORS!r} element is not a " + f"list: {processor_data_list!r}" + ) + return processor_data_list + + +def parse_per_processor_data(processor_data: Dict[str, Any]) -> Tuple: + """ + Return a tuple of mandatory results from NLPRP per-processor data, or raise + KeyError. + """ + if not isinstance(processor_data, dict): + raise ValueError(f"Processor result is not a dict: {processor_data!r}") + + try: + name = processor_data[NlprpKeys.NAME] + except KeyError: + raise KeyError( + "Processor result does not contain key " + f"{NlprpKeys.NAME!r}: {processor_data!r}" + ) + + try: + version = processor_data[NlprpKeys.VERSION] + except KeyError: + raise KeyError( + "Processor result does not contain key " + f"{NlprpKeys.VERSION!r}: {processor_data!r}" + ) + + is_default_version = processor_data.get(NlprpKeys.IS_DEFAULT_VERSION, True) + + try: + success = processor_data[NlprpKeys.SUCCESS] + except KeyError: + raise KeyError( + "Processor result does not contain key " + f"{NlprpKeys.SUCCESS!r}: {processor_data!r}" + ) + + try: + processor_results = processor_data[NlprpKeys.RESULTS] + except KeyError: + raise KeyError( + "Processor result does not contain key " + f"{NlprpKeys.RESULTS!r}: {processor_data!r}" + ) + + return name, version, is_default_version, success, processor_results + + # ============================================================================= # Exceptions # ============================================================================= @@ -370,38 +538,63 @@ def get_remote_processors(self) -> List[ServerProcessor]: """ # Make request list_procs_request = make_nlprp_dict() - list_procs_request[NKeys.COMMAND] = NlprpCommands.LIST_PROCESSORS + list_procs_request[NlprpKeys.COMMAND] = NlprpCommands.LIST_PROCESSORS request_json = to_json_str(list_procs_request) # Send request, get response json_response = self._post_get_json(request_json, may_fail=False) - status = json_get_int(json_response, NKeys.STATUS) + status = json_get_int(json_response, NlprpKeys.STATUS) if not HttpStatus.is_good_answer(status): - errors = json_get_array(json_response, NKeys.ERRORS) + errors = json_get_array(json_response, NlprpKeys.ERRORS) for err in errors: log.error(f"Error received: {err!r}") raise HTTPError(f"Response status was: {status}") processors = [] # type: List[ServerProcessor] - proclist = json_response[NKeys.PROCESSORS] # type: JsonArrayType - for procinfo in proclist: - proc = ServerProcessor( - # Mandatory: - name=procinfo[NKeys.NAME], - title=procinfo[NKeys.TITLE], - version=procinfo[NKeys.VERSION], - is_default_version=procinfo.get( - NKeys.IS_DEFAULT_VERSION, True - ), - description=procinfo[NKeys.DESCRIPTION], - # Optional: - schema_type=procinfo.get( - NKeys.SCHEMA_TYPE, NlprpValues.UNKNOWN - ), - sql_dialect=procinfo.get(NKeys.SQL_DIALECT, ""), - tabular_schema=procinfo.get(NKeys.TABULAR_SCHEMA), + try: + proclist = json_response[ + NlprpKeys.PROCESSORS + ] # type: JsonArrayType + except KeyError: + raise KeyError( + f"Server did not provide key {NlprpKeys.PROCESSORS!r} in its " + f"response: {json_response!r}" ) + if not isinstance(proclist, list): + raise ValueError( + f"Server's value of {NlprpKeys.PROCESSORS!r} is not a list: " + f"{proclist!r}" + ) + for procinfo in proclist: + if not 
isinstance(procinfo, dict): + raise ValueError( + f"Server's procinfo object not a dict: {procinfo!r}" + ) + # Any of the following may raise KeyError if missing: + try: + proc = ServerProcessor( + # Mandatory: + name=procinfo[NlprpKeys.NAME], + title=procinfo[NlprpKeys.TITLE], + version=procinfo[NlprpKeys.VERSION], + is_default_version=procinfo.get( + NlprpKeys.IS_DEFAULT_VERSION, True + ), + description=procinfo[NlprpKeys.DESCRIPTION], + # Optional: + schema_type=procinfo.get( + NlprpKeys.SCHEMA_TYPE, NlprpValues.UNKNOWN + ), + sql_dialect=procinfo.get(NlprpKeys.SQL_DIALECT, ""), + tabular_schema=procinfo.get(NlprpKeys.TABULAR_SCHEMA), + ) + except KeyError: + log.critical( + "NLPRP server's processor information is missing a " + "required field" + ) + raise processors.append(proc) return processors @@ -449,17 +642,17 @@ def __init__( # Set up processing request self._request_process = make_nlprp_dict() - self._request_process[NKeys.COMMAND] = NlprpCommands.PROCESS - self._request_process[NKeys.ARGS] = { - NKeys.PROCESSORS: [], # type: List[str] - NKeys.QUEUE: True, - NKeys.CLIENT_JOB_ID: self._client_job_id, - NKeys.INCLUDE_TEXT: False, - NKeys.CONTENT: [], # type: List[str] + self._request_process[NlprpKeys.COMMAND] = NlprpCommands.PROCESS + self._request_process[NlprpKeys.ARGS] = { + NlprpKeys.PROCESSORS: [], # type: List[str] + NlprpKeys.QUEUE: True, + NlprpKeys.CLIENT_JOB_ID: self._client_job_id, + NlprpKeys.INCLUDE_TEXT: False, + NlprpKeys.CONTENT: [], # type: List[str] } # Set up fetch_from_queue request self._fetch_request = make_nlprp_dict() - self._fetch_request[NKeys.COMMAND] = NlprpCommands.FETCH_FROM_QUEUE + self._fetch_request[NlprpKeys.COMMAND] = NlprpCommands.FETCH_FROM_QUEUE self.nlp_data = None # type: Optional[JsonObjectType] # ... the JSON response @@ -542,7 +735,7 @@ def compare(x: Any) -> None: It can be quite a big overestimate, so we probably shouldn't chuck out requests just because the Python size looks too big. - """ # noqa + """ # noqa: E501 if not max_length: # None or 0 return False # no maximum; not too long # Fast, apt to overestimate size a bit (as above) @@ -572,10 +765,12 @@ def _add_processor_to_request( procname: name of processor on the server procversion: version of processor on the server """ - info = {NKeys.NAME: procname} + info = {NlprpKeys.NAME: procname} if procversion: - info[NKeys.VERSION] = procversion - self._request_process[NKeys.ARGS][NKeys.PROCESSORS].append(info) + info[NlprpKeys.VERSION] = procversion + self._request_process[NlprpKeys.ARGS][NlprpKeys.PROCESSORS].append( + info + ) def _add_all_processors_to_request(self) -> None: """ @@ -624,10 +819,10 @@ def add_text(self, text: str, metadata: Dict[str, Any]) -> None: if self.number_of_records > self._cloudcfg.max_records_per_request: raise RecordsPerRequestExceeded - new_content = {NKeys.METADATA: metadata, NKeys.TEXT: text} + new_content = {NlprpKeys.METADATA: metadata, NlprpKeys.TEXT: text} # Add all the identifying information. - args = self._request_process[NKeys.ARGS] - content_key = NKeys.CONTENT # local copy for fractional speedup + args = self._request_process[NlprpKeys.ARGS] + content_key = NlprpKeys.CONTENT # local copy for fractional speedup old_content = copy(args[content_key]) args[content_key].append(new_content) max_length = self._cloudcfg.max_content_length @@ -658,25 +853,25 @@ def send_process_request( should the server include the source text in the reply? 
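+
+        A minimal usage sketch (hypothetical setup; assumes ``crinfo`` is a
+        valid ``CloudRunInfo`` object):
+
+        .. code-block:: python
+
+            req = CloudRequestProcess(crinfo=crinfo)
+            req.add_text("Some clinical text", metadata={})
+            req.send_process_request(queue=True)  # server queues the job
+            # ... later:
+            if req.check_if_ready():
+                req.process_all()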
""" # Don't send off an empty request - if not self._request_process[NKeys.ARGS][NKeys.CONTENT]: + if not self._request_process[NlprpKeys.ARGS][NlprpKeys.CONTENT]: log.warning("Request empty - not sending.") return # Create request if cookies: self.cookies = cookies - self._request_process[NKeys.ARGS][NKeys.QUEUE] = queue - self._request_process[NKeys.ARGS][ - NKeys.INCLUDE_TEXT + self._request_process[NlprpKeys.ARGS][NlprpKeys.QUEUE] = queue + self._request_process[NlprpKeys.ARGS][ + NlprpKeys.INCLUDE_TEXT ] = include_text_in_reply request_json = to_json_str(self._request_process) # Send request; get response json_response = self._post_get_json(request_json) - status = json_response[NKeys.STATUS] + status = json_response[NlprpKeys.STATUS] if queue and status == HttpStatus.ACCEPTED: - self.queue_id = json_response[NKeys.QUEUE_ID] + self.queue_id = json_response[NlprpKeys.QUEUE_ID] self._fetched = False elif (not queue) and status == HttpStatus.OK: self.nlp_data = json_response @@ -711,7 +906,9 @@ def _try_fetch( # Create request if cookies: self.cookies = cookies - self._fetch_request[NKeys.ARGS] = {NKeys.QUEUE_ID: self.queue_id} + self._fetch_request[NlprpKeys.ARGS] = { + NlprpKeys.QUEUE_ID: self.queue_id + } request_json = to_json_str(self._fetch_request) # Send request; get response @@ -732,9 +929,9 @@ def check_if_ready(self, cookies: CookieJar = None) -> bool: json_response = self._try_fetch(cookies) if not json_response: return False - status = json_response[NKeys.STATUS] + status = json_response[NlprpKeys.STATUS] pending_use_202 = ( - Version(json_response[NKeys.VERSION]) + Version(json_response[NlprpKeys.VERSION]) >= NlprpVersions.FETCH_Q_PENDING_RETURNS_202 ) if status == HttpStatus.OK: @@ -765,87 +962,133 @@ def check_if_ready(self, cookies: CookieJar = None) -> bool: # ------------------------------------------------------------------------- @staticmethod - def get_nlp_values_internal( - processor_data: Dict[str, Any], metadata: Dict[str, Any] - ) -> Generator[Tuple[str, Dict[str, Any], str], None, None]: + def gen_nlp_values_generic_single_table( + processor: Cloud, + tablename: str, + rows: List[Dict[str, Any]], + metadata: Dict[str, Any], + column_renames: Dict[str, str] = None, + ) -> Generator[Tuple[str, Dict[str, Any], Cloud], None, None]: """ - Get result values from processed data from a CRATE server-side. + Get result values from processed data, where the results object is a + list of rows (each row in dictionary format), all for a single table, + such as from a remote CRATE server. + + Success should have been pre-verified. Args: - processor_data: - NLPRP results for one processor + processor: + The processor object. + tablename: + The table name to use. + rows: + List of NLPRP results for one processor. Each result represents + a row of a table and is in dictionary format. metadata: The metadata for a particular document - it would have been sent with the document and the server would have sent it back. + column_renames: + Column renames to apply. - Yields ``(output_tablename, formatted_result, processor_name)``. + Yields ``(output_tablename, formatted_result, processor)``. 
""" - if not processor_data[NKeys.SUCCESS]: - report_processor_errors(processor_data) - return - for result in processor_data[NKeys.RESULTS]: - result.update(metadata) - yield result + column_renames = column_renames or {} + for row in rows: + rename_keys_in_dict(row, column_renames) + row.update(metadata) + yield tablename, row, processor @staticmethod - def get_nlp_values_gate( - processor_data: Dict[str, Any], + def gen_nlp_values_gate( processor: Cloud, + processor_results: List[Dict[str, Any]], metadata: Dict[str, Any], text: str = "", - ) -> Generator[Tuple[Dict[str, Any], Cloud], None, None]: + ) -> Generator[Tuple[str, Dict[str, Any], Cloud], None, None]: """ - Gets result values from processed GATE data which will originally - be in the following format: + Generates row values from processed GATE data. + + Success should have been pre-verified. + + Args: + processor: + The processor object: + processor_results: + A list of dictionaries (originally from JSON), each + representing a row in a table, and each expected to have this + format: + + .. code-block:: none + + { + 'set': set the results belong to (e.g. 'Medication'), + 'type': annotation type, + 'start': start index, + 'end': end index, + 'features': { + a dictionary of features, e.g. having keys 'drug', + 'frequency', etc., with corresponding values + } + } + + metadata: + The metadata for a particular document - it would have been + sent with the document and the server would have sent it back. + text: + The source text itself (optional). + + Yields: + + tuples ``(output_tablename, formatted_result, processor)`` + + Each instance of ``formatted_result`` has this format: .. code-block:: none { - 'set': set the results belong to (e.g. 'Medication'), - 'type': annotation type, - 'start': start index, - 'end': end index, - 'features': {a dictionary of features e.g. 'drug', 'frequency', etc} + GateFieldNames.TYPE: annotation type, + GateFieldNames.SET: set, + GateFieldNames.STARTPOS: start index, + GateFieldNames.ENDPOS: end index, + GateFieldNames.CONTENT: text fragment, + FEATURE1: VALUE1, + FEATURE2: VALUE2, + ... } - - Yields ``(output_tablename, formatted_result, processor_name)``. 
-        """  # noqa
-        if not processor_data[NKeys.SUCCESS]:
-            report_processor_errors(processor_data)
-            return
-        for result in processor_data[NKeys.RESULTS]:
-            # Assuming each set of results says what annotation type
-            # it is (annotation type is stored as lower case)
-            annottype = result[GateResultKeys.TYPE].lower()
-            features = result[GateResultKeys.FEATURES]
-            start = result[GateResultKeys.START]
-            end = result[GateResultKeys.END]
+        """
+        for row in processor_results:
+            # Assuming each row says what annotation type it is (annotation
+            # type is stored as lower case):
+            annottype = row[GateResultKeys.TYPE].lower()
+            features = row[GateResultKeys.FEATURES]
+            start = row[GateResultKeys.START]
+            end = row[GateResultKeys.END]
             formatted_result = {
-                "_type": annottype,
-                "_set": result[GateResultKeys.SET],
-                "_start": start,
-                "_end": end,
-                "_content": "" if not text else text[start:end],
+                GateFieldNames.TYPE: annottype,
+                GateFieldNames.SET: row[GateResultKeys.SET],
+                GateFieldNames.STARTPOS: start,
+                GateFieldNames.ENDPOS: end,
+                GateFieldNames.CONTENT: text[start:end] if text else "",
             }
-
             formatted_result.update(features)
             c = processor.get_otconf_from_type(annottype)
             rename_keys_in_dict(formatted_result, c.renames)
             set_null_values_in_dict(formatted_result, c.null_literals)
             formatted_result.update(metadata)
             tablename = processor.get_tablename_from_type(annottype)
-            yield tablename, formatted_result
+            yield tablename, formatted_result, processor

-    def get_nlp_values(
+    def gen_nlp_values(
         self,
-    ) -> Generator[Tuple[Dict[str, Any], Cloud], None, None]:
+    ) -> Generator[Tuple[str, Dict[str, Any], Cloud], None, None]:
         """
         Process response data that we have already obtained from the server,
         generating individual NLP results.

         Yields:
             ``(tablename, result, processor)`` for each result.
+            The ``tablename`` value is the actual destination database table.

         Raises:
             :exc:`KeyError` if an unexpected processor turned up in the
             results
@@ -855,21 +1098,24 @@ def get_nlp_values(
-            "Method 'get_nlp_values' must only be called "
+            "Method 'gen_nlp_values' must only be called "
             "after nlp_data is obtained"
         )
-        for result in self.nlp_data[NKeys.RESULTS]:
-            metadata = result[NKeys.METADATA]  # type: Dict[str, Any]
-            # ... expected type because that's what we sent; see add_text()
-            text = result.get(NKeys.TEXT)
-            for processor_data in result[NKeys.PROCESSORS]:  # type: Dict
+        docresultlist = extract_nlprp_top_level_results(self.nlp_data)
+        for docresult in docresultlist:
+            metadata, _, _, _ = parse_nlprp_docresult_metadata(docresult)
+            text = docresult.get(NlprpKeys.TEXT)
+            processor_data_list = extract_processor_data_list(docresult)
+            for processor_data in processor_data_list:
+                # Details of the server processor that has responded:
+                (
+                    name,
+                    version,
+                    is_default_version,
+                    success,
+                    processor_results,
+                ) = parse_per_processor_data(processor_data)
                 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 # Check that the processor was one we asked for.
                 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                # Details of the server processor that has responded:
-                name = processor_data[NKeys.NAME]
-                version = processor_data[NKeys.VERSION]
-                is_default_version = processor_data.get(
-                    NKeys.IS_DEFAULT_VERSION, True
-                )
                 try:
                     # Retrieve the Python object corresponding to the server
                     # processor that has responded:
@@ -895,20 +1141,95 @@ def get_nlp_values(
                     raise KeyError(failmsg)

                 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                # OK; we're happy with the processor. Process its results.
+                # OK; we're happy with the processor. Was it happy?
+                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                if not success:
+                    report_processor_errors(processor_data)
+                    continue  # skip this failed processor; try the others
+
+                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                # All well. Process the results.
                 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                if processor.format == NlpDefValues.FORMAT_GATE:
-                    for t, r in self.get_nlp_values_gate(
-                        processor_data, processor, metadata, text
-                    ):
-                        yield t, r, processor
+                # See nlprp.rst.
+                if isinstance(processor_results, dict):
+                    # MULTI-TABLE FORMAT.
+                    # This is a dictionary mapping tables to row lists.
+                    if not processor.is_tabular():
+                        raise RuntimeError(
+                            f"Unsupported: processor {name!r} is returning "
+                            f"multi-table results but hasn't provided a "
+                            f"table schema"
+                        )
+                    tnames = processor.get_tabular_schema_tablenames()
+                    for remote_tablename, rows in processor_results.items():
+                        if remote_tablename not in tnames:
+                            raise ValueError(
+                                f"For processor {name!r}, data provided for "
+                                f"table {remote_tablename!r}, but this was "
+                                "not in the schema"
+                            )
+                        dest_tablename = processor.get_tablename_from_type(
+                            remote_tablename
+                        )
+                        yield from self.gen_nlp_values_generic_single_table(
+                            processor=processor,
+                            tablename=dest_tablename,
+                            rows=rows,
+                            metadata=metadata,
+                            column_renames=processor.get_otconf_from_type(
+                                remote_tablename
+                            ).renames,
+                        )
+                elif isinstance(processor_results, list):
+                    # SINGLE TABLE FORMAT.
+                    # This is a list of rows, where each row should be a
+                    # dictionary mapping column names to values.
+                    if processor.format == NlpDefValues.FORMAT_GATE:
+                        # We have special knowledge of the "traditional" GATE
+                        # format. The sub-function will work out the table
+                        # name(s).
+                        yield from self.gen_nlp_values_gate(
+                            processor=processor,
+                            processor_results=processor_results,
+                            metadata=metadata,
+                            text=text,
+                        )
+                    else:
+                        # Potentially valid whether or not there is a
+                        # tabular_schema. The results object is a generic list
+                        # of column_name/value dictionaries.
+                        if processor.is_tabular():
+                            # Only valid here if there is a SINGLE table in
+                            # the tabular_schema.
+                            tnames = processor.get_tabular_schema_tablenames()
+                            if len(tnames) != 1:
+                                raise ValueError(
+                                    f"Processor {name!r} returned results in "
+                                    "list format, but this is only valid for "
+                                    "a single table; its tables are "
+                                    f"{tnames!r}"
+                                )
+                            remote_tablename = tnames[0]
+                        else:
+                            # We use the FIRST defined table name.
+ remote_tablename = processor.get_first_tablename() + dest_tablename = processor.get_tablename_from_type( + remote_tablename + ) + yield from self.gen_nlp_values_generic_single_table( + processor=processor, + tablename=dest_tablename, + rows=processor_results, + metadata=metadata, + column_renames=processor.get_otconf_from_type( + remote_tablename + ).renames, + ) else: - for res in self.get_nlp_values_internal( - processor_data, metadata - ): - # For non-GATE processors ther will only be one table - # name - yield processor.tablename, res, processor + raise ValueError( + f"For processor {name!r}, bad results format: " + f"{processor_results!r}" + ) # @do_cprofile def process_all(self) -> None: @@ -921,7 +1242,7 @@ def process_all(self) -> None: sessions = [] - for tablename, nlp_values, processor in self.get_nlp_values(): + for tablename, nlp_values, processor in self.gen_nlp_values(): nlp_values[FN_NLPDEF] = nlpname session = processor.dest_session if session not in sessions: @@ -973,12 +1294,12 @@ def show_queue(self) -> Optional[List[Dict[str, Any]]]: request_json = to_json_str(show_request) json_response = self._post_get_json(request_json, may_fail=False) - status = json_response[NKeys.STATUS] + status = json_response[NlprpKeys.STATUS] if status == HttpStatus.OK: try: - queue = json_response[NKeys.QUEUE] + queue = json_response[NlprpKeys.QUEUE] except KeyError: - log.error(f"Response did not contain key {NKeys.QUEUE!r}.") + log.error(f"Response did not contain key {NlprpKeys.QUEUE!r}.") raise return queue else: @@ -991,7 +1312,7 @@ def delete_all_from_queue(self) -> None: """ delete_request = make_nlprp_request( command=NlprpCommands.DELETE_FROM_QUEUE, - command_args={NKeys.DELETE_ALL: True}, + command_args={NlprpKeys.DELETE_ALL: True}, ) request_json = to_json_str(delete_request) response = self._post(request_json, may_fail=False) @@ -1014,7 +1335,7 @@ def delete_from_queue(self, queue_ids: List[str]) -> None: """ delete_request = make_nlprp_request( command=NlprpCommands.DELETE_FROM_QUEUE, - command_args={NKeys.QUEUE_IDS: queue_ids}, + command_args={NlprpKeys.QUEUE_IDS: queue_ids}, ) request_json = to_json_str(delete_request) response = self._post(request_json, may_fail=False) diff --git a/crate_anon/nlp_manager/input_field_config.py b/crate_anon/nlp_manager/input_field_config.py index c00c9c96..01074cd0 100644 --- a/crate_anon/nlp_manager/input_field_config.py +++ b/crate_anon/nlp_manager/input_field_config.py @@ -45,13 +45,22 @@ table_or_view_exists, ) from cardinal_pythonlib.timing import MultiTimerContext, timer -from sqlalchemy import BigInteger, Column, DateTime, Index, String, Table +from sqlalchemy import ( + BigInteger, + Column, + DateTime, + Index, + Integer, + String, + Table, +) from sqlalchemy.engine.base import Engine from sqlalchemy.orm.session import Session from sqlalchemy.sql import and_, column, exists, null, or_, select, table from sqlalchemy.sql.schema import MetaData from crate_anon.common.parallel import is_my_job_by_hash_prehashed +from crate_anon.common.sql import decorate_index_name from crate_anon.common.stringfunc import relevant_for_nlp from crate_anon.nlp_manager.constants import ( FN_CRATE_VERSION_FIELD, @@ -254,7 +263,9 @@ def get_core_columns_for_dest() -> List[Column]: return [ Column( FN_PK, - BigInteger, + # Autoincrement under SQLite needs a trick: + # https://docs.sqlalchemy.org/en/20/dialects/sqlite.html + BigInteger().with_variant(Integer, "sqlite"), primary_key=True, autoincrement=True, comment="Arbitrary primary key (PK) of output record", 
@@ -322,21 +333,28 @@ def get_core_columns_for_dest() -> List[Column]: ] @staticmethod - def get_core_indexes_for_dest() -> List[Index]: + def get_core_indexes_for_dest( + tablename: str, engine: Engine + ) -> List[Index]: """ Returns the core indexes to be applied to the destination tables. Primarily, these are for columns that refer to the source. + Args: + tablename: + The name of the table to be used in the destination. + engine: + The destination database SQLAlchemy Engine. + Returns: a list of SQLAlchemy :class:`Index` objects See - - https://stackoverflow.com/questions/179085/multiple-indexes-vs-multi-column-indexes - """ # noqa + """ # noqa: E501 return [ Index( - "_idx_srcref", + decorate_index_name("_idx_srcref", tablename, engine), # Remember, order matters; more to less specific # See also BaseNlpParser.delete_dest_record FN_SRCPKVAL, @@ -346,9 +364,12 @@ def get_core_indexes_for_dest() -> List[Index]: FN_SRCDB, FN_SRCPKSTR, ), - Index("_idx_srcdate", FN_SRCDATETIMEVAL), Index( - "_idx_deletion", + decorate_index_name("_idx_srcdate", tablename, engine), + FN_SRCDATETIMEVAL, + ), + Index( + decorate_index_name("_idx_deletion", tablename, engine), # We sometimes delete just using the following; see # BaseNlpParser.delete_where_srcpk_not FN_NLPDEF, diff --git a/crate_anon/nlp_manager/models.py b/crate_anon/nlp_manager/models.py index db920aea..8e56aa0b 100644 --- a/crate_anon/nlp_manager/models.py +++ b/crate_anon/nlp_manager/models.py @@ -29,7 +29,7 @@ from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.schema import Column, Index, MetaData -from sqlalchemy.types import BigInteger, DateTime, String +from sqlalchemy.types import BigInteger, DateTime, Integer, String from crate_anon.anonymise.constants import COMMENT, TABLE_KWARGS from crate_anon.nlp_manager.constants import ( @@ -96,7 +96,8 @@ class NlpRecord(ProgressBase): pk = Column( "pk", - BigInteger, + # https://docs.sqlalchemy.org/en/20/dialects/sqlite.html + BigInteger().with_variant(Integer, "sqlite"), primary_key=True, autoincrement=True, comment="PK of NLP record (no specific use)", diff --git a/crate_anon/nlp_manager/nlp_definition.py b/crate_anon/nlp_manager/nlp_definition.py index 5940a416..643f8e12 100644 --- a/crate_anon/nlp_manager/nlp_definition.py +++ b/crate_anon/nlp_manager/nlp_definition.py @@ -175,7 +175,7 @@ def _make_proclist( hashphrase = "doesnotmatter" if_clin_docs = "INPUT_FIELD_CLINICAL_DOCUMENTS" if_prog_notes = "INPUT_FIELD_PROGRESS_NOTES" - inputfields = f"{if_clin_docs}\n" f" {if_prog_notes}" + inputfields = f"{if_clin_docs}\n {if_prog_notes}" truncate_text_at = "32766" my_env = "MY_ENV_SECTION" my_src_db = "SOURCE_DATABASE" @@ -814,7 +814,7 @@ def _make_proclist( {ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_biochemistry.Crp {ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD} -""" # noqa +""" # noqa: E501 # ============================================================================= diff --git a/crate_anon/nlp_manager/nlp_manager.py b/crate_anon/nlp_manager/nlp_manager.py index 723b5341..75378ecf 100755 --- a/crate_anon/nlp_manager/nlp_manager.py +++ b/crate_anon/nlp_manager/nlp_manager.py @@ -58,7 +58,7 @@ mysqld about 18% [from top] nlp_manager.py about 4-5% * 8 [from top] -""" # noqa +""" # noqa: E501 # ============================================================================= # Imports @@ -113,37 +113,36 @@ BaseNlpParser, TextProcessingFailed, ) +from crate_anon.nlp_manager.cloud_parser import Cloud +from 
crate_anon.nlp_manager.cloud_request import ( + CloudRequest, + CloudRequestListProcessors, + CloudRequestProcess, + CloudRequestQueueManagement, + extract_nlprp_top_level_results, + parse_nlprp_docresult_metadata, +) from crate_anon.nlp_manager.cloud_request_sender import CloudRequestSender +from crate_anon.nlp_manager.cloud_run_info import CloudRunInfo from crate_anon.nlp_manager.constants import ( DEFAULT_REPORT_EVERY_NLP, - MAX_STRING_PK_LENGTH, - NLP_CONFIG_ENV_VAR, - NlpDefConfigKeys, -) -from crate_anon.nlp_manager.input_field_config import ( - InputFieldConfig, FN_SRCDB, - FN_SRCTABLE, + FN_SRCFIELD, FN_SRCPKFIELD, - FN_SRCPKVAL, FN_SRCPKSTR, - FN_SRCFIELD, + FN_SRCPKVAL, + FN_SRCTABLE, + MAX_STRING_PK_LENGTH, + NLP_CONFIG_ENV_VAR, + NlpDefConfigKeys, TRUNCATED_FLAG, ) -from crate_anon.nlp_manager.models import FN_SRCHASH, NlpRecord +from crate_anon.nlp_manager.input_field_config import InputFieldConfig +from crate_anon.nlp_manager.models import NlpRecord from crate_anon.nlp_manager.nlp_definition import ( NlpDefinition, demo_nlp_config, ) -from crate_anon.nlp_manager.cloud_parser import Cloud -from crate_anon.nlp_manager.cloud_request import ( - CloudRequest, - CloudRequestListProcessors, - CloudRequestProcess, - CloudRequestQueueManagement, -) -from crate_anon.nlp_manager.cloud_run_info import CloudRunInfo -from crate_anon.nlprp.constants import NlprpKeys as NKeys from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE # from crate_anon.common.profiling import do_cprofile @@ -750,7 +749,6 @@ def retrieve_nlp_data(crinfo: CloudRunInfo, incremental: bool = False) -> None: ifconfig_cache[if_section] = ifconfig seen_srchashs = [] # type: List[str] cloud_request = CloudRequestProcess(crinfo=crinfo) - # cloud_request.set_mirror_processors(mirror_procs) cloud_request.set_queue_id(queue_id) log.info(f"Atempting to retrieve data from request #{i} ...") i += 1 @@ -761,21 +759,20 @@ def retrieve_nlp_data(crinfo: CloudRunInfo, incremental: bool = False) -> None: if not ready: # If results are not ready for this particular queue_id, put # back in file - # request_data.write(f"{if_section},{queue_id}\n") remaining_data.append(f"{if_section},{queue_id}\n") all_ready = False else: - nlp_data = cloud_request.nlp_data - for result in nlp_data[NKeys.RESULTS]: + docresultlist = extract_nlprp_top_level_results( + cloud_request.nlp_data + ) + for result in docresultlist: # There are records associated with the given queue_id records_exist = True uncommitted_data = True # 'metadata' is just 'other_values' from before - metadata = result[NKeys.METADATA] # type: Dict[str, Any] - # ... 
expected type because that's what we sent; see add_text() - pkval = metadata[FN_SRCPKVAL] - pkstr = metadata[FN_SRCPKSTR] - srchash = metadata[FN_SRCHASH] + _, pkval, pkstr, srchash = parse_nlprp_docresult_metadata( + result + ) progrec = None if incremental: progrec = ifconfig.get_progress_record(pkval, pkstr) @@ -865,14 +862,17 @@ def process_cloud_now( for cloud_request in cloud_requests: if cloud_request.request_failed: continue + # (a) handle the actual data cloud_request.process_all() - nlp_data = cloud_request.nlp_data - for result in nlp_data[NKeys.RESULTS]: + # (b) handle the progress records + docresultlist = extract_nlprp_top_level_results( + cloud_request.nlp_data + ) + for result in docresultlist: # 'metadata' is just 'other_values' from before - metadata = result[NKeys.METADATA] - pkval = metadata[FN_SRCPKVAL] - pkstr = metadata[FN_SRCPKSTR] - srchash = metadata[FN_SRCHASH] + _, pkval, pkstr, srchash = parse_nlprp_docresult_metadata( + result + ) progrec = None if incremental: # A word of explanation: to get here, the record must @@ -1078,7 +1078,7 @@ def test_nlp_stdin(nlpdef: NlpDefinition) -> None: ) procreq.add_text(text, metadata={}) procreq.send_process_request(queue=False) - results = procreq.nlp_data[NKeys.RESULTS] + results = extract_nlprp_top_level_results(procreq.nlp_data) result_found = True # ... may not really be true, but we have something to show formatted_results = json.dumps(results, indent=JSON_INDENT) diff --git a/crate_anon/nlp_manager/output_user_config.py b/crate_anon/nlp_manager/output_user_config.py index 8340fde1..a05e45ac 100644 --- a/crate_anon/nlp_manager/output_user_config.py +++ b/crate_anon/nlp_manager/output_user_config.py @@ -65,14 +65,15 @@ class OutputUserConfig: """ - Class defining configuration for the output of a given GATE app. + Class defining configuration for the output of a given GATE app, or remote + cloud app. See the documentation for the :ref:`NLP config file `. """ def __init__( self, - parser: ExtendedConfigParser, + config_parser: ExtendedConfigParser, cfg_output_name: str, schema_required: bool = True, ) -> None: @@ -80,7 +81,7 @@ def __init__( Read config from a configparser section. Args: - parser: + config_parser: :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser` cfg_output_name: config file section name suffix -- this is the second of the @@ -89,18 +90,17 @@ def __init__( - :ref:`NLP config file ` - :class:`crate_anon.nlp_manager.parse_gate.Gate` - - schema_required: - is it required that the user has specified a schema, i.e. - destfields and a desttable? - Should be true for Gate, False - for Cloud as the remote processors may have their own schema - definition. + schema_required: + is it required that the user has specified a schema, i.e. + destfields and a desttable? - Should be true for Gate, False + for Cloud as the remote processors may have their own schema + definition. 
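+
+        A minimal usage sketch (section name hypothetical; assumes an
+        ``NlpDefinition`` as ``nlpdef``):
+
+        .. code-block:: python
+
+            otconfig = OutputUserConfig(
+                config_parser=nlpdef.parser,
+                cfg_output_name="crp_output",
+                schema_required=False,  # e.g. for a Cloud processor
+            )
+            tablename = otconfig.dest_tablename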
""" sectionname = full_sectionname( NlpConfigPrefixes.OUTPUT, cfg_output_name ) - cfg = ConfigSection(section=sectionname, parser=parser) + cfg = ConfigSection(section=sectionname, parser=config_parser) # --------------------------------------------------------------------- # desttable @@ -129,10 +129,10 @@ def __init__( f"section {sectionname!r}; line was {line!r} but should " f"have contained two things" ) - annotation_name = words[0] - field_name = words[1] - ensure_valid_field_name(field_name) - self._renames[annotation_name] = field_name + annotation_or_remote_column_name = words[0] + to_column_name = words[1] + ensure_valid_field_name(to_column_name) + self._renames[annotation_or_remote_column_name] = to_column_name # --------------------------------------------------------------------- # null_literals @@ -158,7 +158,6 @@ def __init__( as_words=False, ) # ... comments will be removed during that process. - # log.critical(dest_field_lines) # If dest_field_lines is empty (as it may be for a Cloud processor) # the following block doesn't execute, so the 'dest' attributed remain # empty @@ -279,7 +278,8 @@ def indexes(self) -> List[Index]: def renames(self) -> Dict[str, str]: """ Return the "rename dictionary": a dictionary mapping GATE annotation - names to fieldnames in the NLP destination table. + names (or cloud remote column names) to local field (column) names in + the NLP destination table. See diff --git a/crate_anon/nlp_manager/parse_clinical.py b/crate_anon/nlp_manager/parse_clinical.py index af931e79..4fb75f30 100644 --- a/crate_anon/nlp_manager/parse_clinical.py +++ b/crate_anon/nlp_manager/parse_clinical.py @@ -642,7 +642,7 @@ class Bp(BaseNlpParser): # Since we produce two variables, SBP and DBP, and we use something a # little more complex than - # :class:`crate_anon.nlp_manager.regex_parser.NumeratorOutOfDenominatorParser`, # noqa + # :class:`crate_anon.nlp_manager.regex_parser.NumeratorOutOfDenominatorParser`, # noqa: E501 # we subclass :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser` # directly.) diff --git a/crate_anon/nlp_manager/parse_cognitive.py b/crate_anon/nlp_manager/parse_cognitive.py index 8f11cc4b..db553e06 100644 --- a/crate_anon/nlp_manager/parse_cognitive.py +++ b/crate_anon/nlp_manager/parse_cognitive.py @@ -153,7 +153,7 @@ class Ace(NumeratorOutOfDenominatorParser): ) \b )?+ {WORD_BOUNDARY} ) - """ # noqa + """ # noqa: E501 # ... note the possessive "?+" above; see tests below. def __init__( diff --git a/crate_anon/nlp_manager/parse_gate.py b/crate_anon/nlp_manager/parse_gate.py index 7e887f77..7a5d18b9 100644 --- a/crate_anon/nlp_manager/parse_gate.py +++ b/crate_anon/nlp_manager/parse_gate.py @@ -52,7 +52,7 @@ from crate_anon.nlp_manager.constants import ( MAX_SQL_FIELD_LEN, ProcessorConfigKeys, - GateFieldNames as GateFN, + GateFieldNames, ) from crate_anon.nlp_manager.nlp_definition import ( NlpDefinition, @@ -173,12 +173,23 @@ def __init__( ProcessorConfigKeys.PROGARGS, required=True ) logtag = nlpdef.logtag or "." + # Also, ensure the user doesn't specify desttable (would be + # confusing). 
+ if self._cfgsection.opt_str(ProcessorConfigKeys.DESTTABLE): + raise ValueError( + f"For GATE processors, don't specify " + f"{ProcessorConfigKeys.DESTTABLE!r}; table information is " + f"in {ProcessorConfigKeys.OUTPUTTYPEMAP!r}" + ) self._outputtypemap = {} # type: Dict[str, OutputUserConfig] self._type_to_tablename = {} # type: Dict[str, str] for annottype, outputsection in chunks(typepairs, 2): annottype = annottype.lower() - c = OutputUserConfig(nlpdef.parser, outputsection) + c = OutputUserConfig( + config_parser=nlpdef.parser, + cfg_output_name=outputsection, + ) self._outputtypemap[annottype] = c self._type_to_tablename[annottype] = c.dest_tablename @@ -318,7 +329,7 @@ def parse( d = tsv_pairs_to_dict(line) log.debug(f"dictionary received: {d}") try: - annottype = d[GateFN.TYPE].lower() + annottype = d[GateFieldNames.TYPE].lower() except KeyError: raise ValueError("_type information not in data received") if annottype not in self._type_to_tablename: @@ -375,8 +386,9 @@ def dest_tables_indexes(self) -> Dict[str, List[Index]]: # docstring in superclass tables = {} # type: Dict[str, List[Index]] for anottype, otconfig in self._outputtypemap.items(): - tables[otconfig.dest_tablename] = ( - self._standard_gate_indexes() + otconfig.indexes + dest_tablename = otconfig.dest_tablename + tables[dest_tablename] = ( + self._standard_gate_indexes(dest_tablename) + otconfig.indexes ) return tables diff --git a/crate_anon/nlp_manager/parse_haematology.py b/crate_anon/nlp_manager/parse_haematology.py index 5468823c..134c69b7 100644 --- a/crate_anon/nlp_manager/parse_haematology.py +++ b/crate_anon/nlp_manager/parse_haematology.py @@ -96,7 +96,7 @@ class Haemoglobin(SimpleNumericalResultParser): old-style units, 9 g/dL = 90 g/L, but this will be interpreted as 9 g/L. This problem is hard to avoid. - """ # noqa + """ # noqa: E501 HAEMOGLOBIN_BASE = rf""" {WORD_BOUNDARY} (?: Ha?emoglobin | Hb | HGB ) {WORD_BOUNDARY} diff --git a/crate_anon/nlp_manager/parse_medex.py b/crate_anon/nlp_manager/parse_medex.py index 385284f6..bf03a122 100644 --- a/crate_anon/nlp_manager/parse_medex.py +++ b/crate_anon/nlp_manager/parse_medex.py @@ -388,7 +388,7 @@ Therefore, we should have a Java parameter specified in the config file as ``-Dfile.encoding=UTF-8``. -""" # noqa +""" # noqa: E501 import logging import os diff --git a/crate_anon/nlp_manager/prepare_umls_for_bioyodie.py b/crate_anon/nlp_manager/prepare_umls_for_bioyodie.py index 6d06752d..2a17ae2e 100755 --- a/crate_anon/nlp_manager/prepare_umls_for_bioyodie.py +++ b/crate_anon/nlp_manager/prepare_umls_for_bioyodie.py @@ -138,7 +138,7 @@ EXIT_SUCCESS, ) -_ = """ +_ = r""" =============================================================================== My launch command: @@ -339,7 +339,7 @@ - set JVM limit to 10 GB (meaning it uses 13.9 GB) rather than 30 GB (making it thrash, use 36 GB, and get killed). 
-""" # noqa +""" # noqa: E501 log = logging.getLogger(__name__) @@ -372,7 +372,7 @@ # - FROM /tmp/tmpg1z_uldg/umls_unzipped # - TO /tmp/tmpg1z_uldg/blah # and using "include" rather than "exclude" mode, we got this: -_NOTES_WORKING_CONFIG_FROM_GUI = """ +_NOTES_WORKING_CONFIG_FROM_GUI = r""" # Configuration Properties File #Mon Sep 14 17:45:02 BST 2020 mmsys_output_stream=gov.nih.nlm.umls.mmsys.io.RRFMetamorphoSysOutputStream @@ -413,7 +413,7 @@ gov.nih.nlm.umls.mmsys.io.NLMFileMetamorphoSysInputStream.meta_source_uri=/tmp/tmpg1z_uldg/umls_unzipped gov.nih.nlm.umls.mmsys.filter.SuppressibleFilter.confirm_default_suppressible_sabttys=HPO|OP;HPO|IS;HPO|OET;NEU|ACR;NEU|OP;NEU|IS;FMA|AB;FMA|OP;MDR|AB;CDT|OP;ICD10AE|PS;ICD10|PS;ICD10AMAE|PS;ICD10AM|PS;NCI|OP;NCI_NICHD|OP;ICPC|PS;ICPC|CS;MTHICPC2EAE|AB;ICPC2EENG|AB;ICPC2P|MTH_PT;ICPC2P|OPN;ICPC2P|MTH_OPN;ICPC2P|OP;ICPC2P|MTH_OP;HCPCS|OP;HCDT|OP;HCPT|OP;HCPCS|OM;HCPCS|OAM;GO|OP;GO|MTH_OP;GO|OET;GO|MTH_OET;GO|IS;GO|MTH_IS;PDQ|OP;PDQ|IS;NCBI|AUN;NCBI|UAUN;LNC|OLC;LNC|LO;LNC|MTH_LO;LNC|OOSN;MDR|OL;MDR|MTH_OL;ICD10PCS|HS;ICD10PCS|AB;ICD10AE|HS;ICD10|HS;HL7V3.0|OP;HL7V3.0|ONP;ICD10CM|AB;ICD9CM|AB;MSH|DEV;MSH|DSV;MSH|QAB;MSH|QEV;MSH|QSV;CPT|AB;HCPT|AB;HCPCS|AB;SNMI|PX;SNMI|HX;SNMI|SX;RCD|OP;RCD|IS;RCD|AS;RCD|AB;RCDSA|OP;RCDSY|OP;RCDAE|OP;RCDSA|IS;RCDSY|IS;RCDAE|IS;RCDSA|AB;RCDSY|AB;RCDAE|AB;RCDSA|OA;RCDSY|OA;RCDAE|OA;RCD|OA;RCDAE|AA;RCD|AA;HCPT|OA;HCPT|AM;HCPCS|OA;HCPCS|AM;HCDT|AB;ALT|AB;HCDT|OA;SNOMEDCT_VET|OAP;SNOMEDCT_VET|OP;SNOMEDCT_US|OAP;SNOMEDCT_US|OP;SNOMEDCT_VET|OAF;SNOMEDCT_VET|OF;SNOMEDCT_US|OAF;SNOMEDCT_US|OF;SNOMEDCT_VET|OAS;SNOMEDCT_VET|IS;SNOMEDCT_US|OAS;SNOMEDCT_US|IS;SNOMEDCT_US|MTH_OAP;SNOMEDCT_US|MTH_OP;SNOMEDCT_US|MTH_OAF;SNOMEDCT_US|MTH_OF;SNOMEDCT_US|MTH_OAS;SNOMEDCT_US|MTH_IS;CCPSS|TC;SCTSPA|OP;SCTSPA|OAF;SCTSPA|OAP;SCTSPA|OAS;SCTSPA|OF;SCTSPA|IS;SCTSPA|MTH_OP;SCTSPA|MTH_OAF;SCTSPA|MTH_OAP;SCTSPA|MTH_OAS;SCTSPA|MTH_OF;SCTSPA|MTH_IS;MSHNOR|DSV;MSHGER|DSV;MDRSPA|OL;MDRSPA|AB;MDRDUT|OL;MDRDUT|AB;MDRFRE|OL;MDRFRE|AB;MDRGER|OL;MDRGER|AB;MDRITA|OL;MDRITA|AB;MDRJPN|OL;MDRJPN|OLJKN;MDRJPN|OLJKN1;MDRCZE|OL;MDRHUN|OL;MDRPOR|OL;MDRCZE|AB;MDRHUN|AB;MDRPOR|AB;LNC-DE-CH|OOSN;LNC-DE-DE|LO;LNC-EL-GR|LO;LNC-ES-AR|LO;LNC-ES-AR|OOSN;LNC-ES-CH|OOSN;LNC-ES-ES|LO;LNC-ET-EE|LO;LNC-FR-BE|LO;LNC-FR-CA|LO;LNC-FR-CH|OOSN;LNC-FR-FR|OLC;LNC-FR-FR|LO;LNC-IT-CH|OOSN;LNC-IT-IT|LO;LNC-KO-KR|LO;LNC-NL-NL|LO;LNC-PT-BR|OLC;LNC-PT-BR|LO;LNC-PT-BR|OOSN;LNC-RU-RU|LO;LNC-TR-TR|LO;LNC-ZH-CN|LO;LNC-DE-AT|LO gov.nih.nlm.umls.mmsys.filter.SourceListFilter.enforce_dep_source_selection=true -""" # noqa +""" # ============================================================================= @@ -524,7 +524,7 @@ def get_mmsys_configfile_text( awk -F '\\|' '{print $1"|"$1}' | perl -pe 's/\n/;/g' \ > /tmp/sab_list.txt - """ # noqa + """ # noqa: E501 # Config file that we'll use as a starting point start_config_filename = join(mmsys_home, "config", release, "user.a.prop") diff --git a/crate_anon/nlp_manager/processor_helpers.py b/crate_anon/nlp_manager/processor_helpers.py new file mode 100644 index 00000000..752de8c1 --- /dev/null +++ b/crate_anon/nlp_manager/processor_helpers.py @@ -0,0 +1,69 @@ +""" +crate_anon/nlp_manager/processor_helpers.py + +=============================================================================== + + Copyright (C) 2015, University of Cambridge, Department of Psychiatry. + Created by Rudolf Cardinal (rnc1001@cam.ac.uk). + + This file is part of CRATE. 
+ + CRATE is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + CRATE is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with CRATE. If not, see . + +=============================================================================== + +**Helper functions to manage all NLP processor classes.** + +These use a delayed import, sorting out some circular import problems. + +""" + +# ============================================================================= +# Imports +# ============================================================================= + +from typing import Optional + +from crate_anon.nlp_manager.base_nlp_parser import TableMaker + + +# ============================================================================= +# Helper functions +# ============================================================================= + + +def make_nlp_parser_unconfigured( + classname: str, raise_if_absent: bool = True +) -> Optional[TableMaker]: + """ + Get a debugging (unconfigured) instance of an NLP parser. + + Args: + classname: the name of the NLP parser class + raise_if_absent: raise ``ValueError`` if there is no match? + + Returns: + the class, or ``None`` if there isn't one with that name + + """ + from crate_anon.nlp_manager.all_processors import ( + get_nlp_parser_class, + ) # delayed import + + cls = get_nlp_parser_class(classname) + if cls: + return cls(nlpdef=None, cfg_processor_name=None) + if raise_if_absent: + raise ValueError(f"Unknown NLP processor type: {classname!r}") + return None diff --git a/crate_anon/nlp_manager/regex_parser.py b/crate_anon/nlp_manager/regex_parser.py index aef4bdba..acc769cf 100644 --- a/crate_anon/nlp_manager/regex_parser.py +++ b/crate_anon/nlp_manager/regex_parser.py @@ -90,8 +90,8 @@ # "Present: Nicola Adams (NA). 1.0. Minutes of the last meeting." # ... which we don't want to be interpreted as "sodium 1.0". # HOW BEST TO DO THIS? -# - https://stackoverflow.com/questions/546433/regular-expression-to-match-outer-brackets # noqa -# https://stackoverflow.com/questions/7898310/using-regex-to-balance-match-parenthesis # noqa +# - https://stackoverflow.com/questions/546433/regular-expression-to-match-outer-brackets # noqa: E501 +# https://stackoverflow.com/questions/7898310/using-regex-to-balance-match-parenthesis # noqa: E501 # - ... simplest is perhaps: base ignorables, or those with brackets, as above # - ... even better than a nested thing is just a list of alternatives @@ -958,7 +958,7 @@ def __init__( \s* {OUT_OF_SEPARATOR} \s* ( {IGNORESIGN_INTEGER} ) # 5. group for denominator )? - """ # noqa + """ # noqa: E501 if debug: log.debug(f"Regex for {self.classname()}: {regex_str}") self.regex_str = regex_str diff --git a/crate_anon/nlp_manager/tests/cloud_request_process_tests.py b/crate_anon/nlp_manager/tests/cloud_request_process_tests.py index 679d1937..2e4d03d5 100644 --- a/crate_anon/nlp_manager/tests/cloud_request_process_tests.py +++ b/crate_anon/nlp_manager/tests/cloud_request_process_tests.py @@ -25,19 +25,58 @@ Unit tests. +Reminder: to enable logging, use e.g. 
pytest -k [testname] --log-cli-level=INFO + """ +import json import logging +import os +from pathlib import Path import sys from unittest import mock, TestCase +from tempfile import TemporaryDirectory +from typing import Any, Dict from cardinal_pythonlib.httpconst import HttpStatus +from sqlalchemy.engine import create_engine from sqlalchemy.exc import OperationalError +from sqlalchemy.schema import Column -from crate_anon.nlprp.constants import ( - NlprpKeys as NKeys, +from crate_anon.nlp_manager.all_processors import ( + register_all_crate_python_processors_with_serverprocessor, +) +from crate_anon.nlp_manager.cloud_parser import Cloud +from crate_anon.nlp_manager.cloud_request import ( + CloudRequest, + CloudRequestListProcessors, + CloudRequestProcess, +) +from crate_anon.nlp_manager.cloud_run_info import CloudRunInfo +from crate_anon.nlp_manager.constants import ( + CloudNlpConfigKeys, + DatabaseConfigKeys, + InputFieldConfigKeys, + NLP_CONFIG_ENV_VAR, + NlpConfigPrefixes, + NlpDefConfigKeys, + NlpDefValues, + NlpOutputConfigKeys, + ProcessorConfigKeys, ) -from crate_anon.nlp_manager.cloud_request import CloudRequestProcess +from crate_anon.nlp_manager.nlp_definition import NlpDefinition +from crate_anon.nlp_manager.nlp_manager import drop_remake, process_cloud_now +from crate_anon.nlp_webserver.server_processor import ServerProcessor +import crate_anon.nlp_webserver.tasks +from crate_anon.nlp_webserver.views import NlpWebViews +from crate_anon.nlprp.constants import NlprpKeys, NlprpValues + +log = logging.getLogger(__name__) + + +# ============================================================================= +# CloudRequestProcessTests +# ============================================================================= class CloudRequestProcessTests(TestCase): @@ -76,10 +115,10 @@ def test_process_all_inserts_values(self) -> None: ("output", {"fruit": "fig"}, self.mock_processor), ] - mock_get_nlp_values_method = mock.Mock(return_value=iter(nlp_values)) + mock_gen_nlp_values_method = mock.Mock(return_value=iter(nlp_values)) with mock.patch.multiple( - self.process, get_nlp_values=mock_get_nlp_values_method + self.process, gen_nlp_values=mock_gen_nlp_values_method ): self.process.process_all() @@ -118,10 +157,10 @@ def test_process_all_handles_failed_insert(self) -> None: "Insert failed", None, None, None ) - mock_get_nlp_values_method = mock.Mock(return_value=iter(nlp_values)) + mock_gen_nlp_values_method = mock.Mock(return_value=iter(nlp_values)) with self.assertLogs(level=logging.ERROR) as logging_cm: with mock.patch.multiple( - self.process, get_nlp_values=mock_get_nlp_values_method + self.process, gen_nlp_values=mock_gen_nlp_values_method ): self.process.process_all() @@ -163,8 +202,8 @@ def test_ready_for_status_ok(self) -> None: self.process.queue_id = "queue_0001" response = { - NKeys.STATUS: HttpStatus.OK, - NKeys.VERSION: "0.3.0", + NlprpKeys.STATUS: HttpStatus.OK, + NlprpKeys.VERSION: "0.3.0", } with mock.patch.object( @@ -177,8 +216,8 @@ def test_not_ready_when_old_server_status_processing(self) -> None: self.process.queue_id = "queue_0001" response = { - NKeys.STATUS: HttpStatus.PROCESSING, - NKeys.VERSION: "0.2.0", + NlprpKeys.STATUS: HttpStatus.PROCESSING, + NlprpKeys.VERSION: "0.2.0", } with mock.patch.object( @@ -191,8 +230,8 @@ def test_not_ready_when_new_server_status_accepted(self) -> None: self.process.queue_id = "queue_0001" response = { - NKeys.STATUS: HttpStatus.ACCEPTED, - NKeys.VERSION: "0.3.0", + NlprpKeys.STATUS: HttpStatus.ACCEPTED, + NlprpKeys.VERSION: 
"0.3.0", } with mock.patch.object( @@ -205,8 +244,8 @@ def test_not_ready_when_server_status_not_found(self) -> None: self.process.queue_id = "queue_0001" response = { - NKeys.STATUS: HttpStatus.NOT_FOUND, - NKeys.VERSION: "0.3.0", + NlprpKeys.STATUS: HttpStatus.NOT_FOUND, + NlprpKeys.VERSION: "0.3.0", } with mock.patch.object( @@ -221,8 +260,8 @@ def test_not_ready_when_server_status_anything_else(self) -> None: self.process.queue_id = "queue_0001" response = { - NKeys.STATUS: HttpStatus.FORBIDDEN, - NKeys.VERSION: "0.3.0", + NlprpKeys.STATUS: HttpStatus.FORBIDDEN, + NlprpKeys.VERSION: "0.3.0", } with mock.patch.object( @@ -232,3 +271,398 @@ def test_not_ready_when_server_status_anything_else(self) -> None: ready = self.process.check_if_ready() self.assertFalse(ready) self.assertIn("Got HTTP status code 403", logging_cm.output[0]) + + +# ============================================================================= +# CloudRequestListProcessorsTests +# ============================================================================= + +# A real one that wasn't working, 2024-12-16, with keys parameterized and +# boolean values Pythonized. +TEST_REMOTE_TABLE_SMOKING = "Smoking:Smoking" +TEST_SMOKING_PROC_NAME = "smoking" +TEST_SMOKING_PROC_VERSION = "0.1" +TEST_SMOKING_PROCINFO = { + NlprpKeys.DESCRIPTION: "A description", + NlprpKeys.IS_DEFAULT_VERSION: True, + NlprpKeys.NAME: TEST_SMOKING_PROC_NAME, + NlprpKeys.SCHEMA_TYPE: NlprpValues.TABULAR, + NlprpKeys.SQL_DIALECT: "mssql", + NlprpKeys.TABULAR_SCHEMA: { + TEST_REMOTE_TABLE_SMOKING: [ + { + NlprpKeys.COLUMN_NAME: "start_", + NlprpKeys.COLUMN_TYPE: "BIGINT", + NlprpKeys.DATA_TYPE: "BIGINT", + NlprpKeys.IS_NULLABLE: False, + }, + { + NlprpKeys.COLUMN_NAME: "end_", + NlprpKeys.COLUMN_TYPE: "BIGINT", + NlprpKeys.DATA_TYPE: "BIGINT", + NlprpKeys.IS_NULLABLE: False, + }, + { + NlprpKeys.COLUMN_NAME: "who", + NlprpKeys.COLUMN_TYPE: "NVARCHAR(255)", + NlprpKeys.DATA_TYPE: "NVARCHAR", + NlprpKeys.IS_NULLABLE: True, + }, + { + NlprpKeys.COLUMN_NAME: "rule", + NlprpKeys.COLUMN_TYPE: "VARCHAR(50)", + NlprpKeys.DATA_TYPE: "VARCHAR", + NlprpKeys.IS_NULLABLE: True, + }, + { + NlprpKeys.COLUMN_NAME: "status", + NlprpKeys.COLUMN_TYPE: "VARCHAR(10)", + NlprpKeys.DATA_TYPE: "VARCHAR", + NlprpKeys.IS_NULLABLE: True, + }, + ] + }, + NlprpKeys.TITLE: "Smoking Status Annotator", + NlprpKeys.VERSION: TEST_SMOKING_PROC_VERSION, +} + + +class CloudRequestListProcessorsTests(TestCase): + def setUp(self) -> None: + self.mock_nlpdef = mock.Mock(name="mock_nlpdef") + self.mock_nlpdef.name = "testlistprocdef" + self.process = CloudRequestListProcessors(nlpdef=self.mock_nlpdef) + self.test_version = "0.3.0" + + def test_processors_key_missing(self) -> None: + response = { + NlprpKeys.STATUS: HttpStatus.ACCEPTED, + NlprpKeys.VERSION: self.test_version, + # Missing: NKeys.PROCESSORS + } + with mock.patch.object( + self.process, "_post_get_json", return_value=response + ): + with self.assertRaises(KeyError): + self.process.get_remote_processors() + + def test_processors_not_list(self) -> None: + response = { + NlprpKeys.STATUS: HttpStatus.ACCEPTED, + NlprpKeys.VERSION: self.test_version, + NlprpKeys.PROCESSORS: "XXX", # not a list + } + with mock.patch.object( + self.process, "_post_get_json", return_value=response + ): + with self.assertRaises(ValueError): + self.process.get_remote_processors() + + def test_procinfo_not_dict(self) -> None: + procinfo = "xxx" # not a dict + response = { + NlprpKeys.STATUS: HttpStatus.ACCEPTED, + NlprpKeys.VERSION: self.test_version, + 
NlprpKeys.PROCESSORS: [procinfo], + } + with mock.patch.object( + self.process, "_post_get_json", return_value=response + ): + with self.assertRaises(ValueError): + self.process.get_remote_processors() + + def test_procinfo_missing_keys(self) -> None: + mandatory_keys = ( + NlprpKeys.NAME, + NlprpKeys.TITLE, + NlprpKeys.VERSION, + NlprpKeys.DESCRIPTION, + ) + base_procinfo = {k: "x" for k in mandatory_keys} + for key in mandatory_keys: + procinfo = base_procinfo.copy() + del procinfo[key] + response = { + NlprpKeys.STATUS: HttpStatus.ACCEPTED, + NlprpKeys.VERSION: self.test_version, + NlprpKeys.PROCESSORS: [procinfo], + } + with mock.patch.object( + self.process, "_post_get_json", return_value=response + ): + with self.assertRaises(KeyError): + self.process.get_remote_processors() + + def test_procinfo_smoking(self) -> None: + response = { + NlprpKeys.STATUS: HttpStatus.ACCEPTED, + NlprpKeys.VERSION: self.test_version, + NlprpKeys.PROCESSORS: [TEST_SMOKING_PROCINFO], + } + with mock.patch.object( + self.process, "_post_get_json", return_value=response + ): + self.process.get_remote_processors() + # Should be happy. + # Clean up: + ServerProcessor.debug_remove_processor( + name=TEST_SMOKING_PROC_NAME, version=TEST_SMOKING_PROC_VERSION + ) + + +# ============================================================================= +# CloudRequestDataTests +# ============================================================================= + + +class CloudRequestDataTests(TestCase): + def setUp(self) -> None: + # On-disk database + self.tempdir = TemporaryDirectory() # will be deleted automatically + self.dbfilepath = Path(self.tempdir.name, "test.sqlite") + log.info(f"Using temporary database: {self.dbfilepath}") + self.dburl = f"sqlite:///{self.dbfilepath.absolute()}" + self.txttable = "notes" + self.pkcol = "pk" + self.txtcol = "note" + self.echo = False + self._mk_test_data() + + # Dummy database + self.dummy_engine = create_engine("sqlite://") + + # Config file + self.nlpdefname = "mynlp" + self.dbsectionname = "mydb" + self.cloudconfigname = "mycloud" + self.inputname = "myinput" + self.cloudclassname = "Cloud" + self.cloudproc_crp = "proc_crp" + self.cloudproc_alcohol = "proc_alcohol" + self.output_crp = "crp_output" + self.output_alcohol = "alcohol_output" + self.configfilepath = Path(self.tempdir.name, "crate_test_nlp.ini") + with open(self.configfilepath, "wt") as f: + configtext = self._mk_nlp_config() + log.debug(configtext) + f.write(configtext) + # import pdb; pdb.set_trace() + + # Server side + register_all_crate_python_processors_with_serverprocessor() + self.mock_pyramid_request = mock.Mock(name="mock_pyramid_request") + self.server = NlpWebViews(request=self.mock_pyramid_request) + self.server._authenticate = mock.Mock() + self.server._set_body_json_from_request = mock.Mock( + name="mock_set_body_json_from_request" + ) + + # Rather than modify the instances, let's try to modify the class. This + # is because process_now() does its own instance creation. + # (CloudRequest is the base class of CloudRequestProcess and + # CloudRequestListProcessors.) 
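+        # (Patching the class attribute means that every CloudRequest
+        # created from here on, e.g. by process_cloud_now(), exchanges JSON
+        # with the in-process mock server via _get_server_response(); no
+        # real HTTP is involved.)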
+ CloudRequest._post_get_json = self._get_server_response + + # Client side #1 + self.mock_nlpdef = mock.Mock() + self.mock_nlpdef.name = "testdef" + self.listprocclient = CloudRequestListProcessors( + nlpdef=self.mock_nlpdef + ) + + # Client side #2 + os.environ[NLP_CONFIG_ENV_VAR] = str(self.configfilepath.absolute()) + self.nlpdef = NlpDefinition(self.nlpdefname) # loads the config + self.crinfo = CloudRunInfo(nlpdef=self.nlpdef) + + def _mk_nlp_config(self) -> str: + """ + Returns a test NLP config file. + """ + return f""" +# NLP definitions + +[{NlpConfigPrefixes.NLPDEF}:{self.nlpdefname}] +{NlpDefConfigKeys.INPUTFIELDDEFS} = + {self.inputname} +{NlpDefConfigKeys.PROCESSORS} = + {self.cloudclassname} {self.cloudproc_crp} + {self.cloudclassname} {self.cloudproc_alcohol} +{NlpDefConfigKeys.PROGRESSDB} = {self.dbsectionname} +{NlpDefConfigKeys.HASHPHRASE} = blah +{NlpDefConfigKeys.CLOUD_CONFIG} = {self.cloudconfigname} +{NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR} = {self.tempdir.name} + +# Inputs + +[{NlpConfigPrefixes.INPUT}:{self.inputname}] +{InputFieldConfigKeys.SRCDB} = {self.dbsectionname} +{InputFieldConfigKeys.SRCTABLE} = {self.txttable} +{InputFieldConfigKeys.SRCPKFIELD} = {self.pkcol} +{InputFieldConfigKeys.SRCFIELD} = {self.txtcol} + +# Processors + +# - CRP +[{NlpConfigPrefixes.PROCESSOR}:{self.cloudproc_crp}] +{ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_biochemistry.Crp +{ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD} +{ProcessorConfigKeys.OUTPUTTYPEMAP} = + crp {self.output_crp} +{ProcessorConfigKeys.DESTDB} = {self.dbsectionname} + +# - Alcohol units +[{NlpConfigPrefixes.PROCESSOR}:{self.cloudproc_alcohol}] +{ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_substance_misuse.AlcoholUnits +{ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD} +{ProcessorConfigKeys.OUTPUTTYPEMAP} = + alcoholunits {self.output_alcohol} +{ProcessorConfigKeys.DESTDB} = {self.dbsectionname} + +# Output sections + +# - CRP +[{NlpConfigPrefixes.OUTPUT}:{self.output_crp}] +{NlpOutputConfigKeys.DESTTABLE} = nlp_crp + +# - Alcohol units +[{NlpConfigPrefixes.OUTPUT}:{self.output_alcohol}] +{NlpOutputConfigKeys.DESTTABLE} = nlp_alcohol + +# Databases + +[{NlpConfigPrefixes.DATABASE}:{self.dbsectionname}] +{DatabaseConfigKeys.URL} = {self.dburl} +{DatabaseConfigKeys.ECHO} = {self.echo} + +# Cloud servers + +[{NlpConfigPrefixes.CLOUD}:{self.cloudconfigname}] +{CloudNlpConfigKeys.CLOUD_URL} = https://dummy_url + +""" # noqa: E501 + + def _mk_test_data(self) -> None: + """ + Inserts some test data into a table. + """ + texts = ["Current CRP 7. Teetotal. Non-smoker."] + engine = create_engine(self.dburl) + with engine.connect() as con: + con.execute( + f""" + CREATE TABLE {self.txttable} ( + {self.pkcol} INTEGER, + {self.txtcol} TEXT + ) + """ + ) + for i, text in enumerate(texts, start=1): + con.execute( + f""" + INSERT INTO {self.txttable} + ({self.pkcol}, {self.txtcol}) + VALUES(:1, :2) + """, + i, + text, + ) + + # noinspection PyUnusedLocal + def _get_server_response( + self, request_json_str: str, may_fail: bool = None + ) -> Dict[str, Any]: + """ + Take a JSON request that has come from our mock client (in string + form), and return a JSON response from our mock server (in dictionary + form). 
+ """ + request_json = json.loads(request_json_str) + log.warning(f"{request_json=}") # *** + self.server.body = request_json + response_json = self.server.index() + log.warning(f"-> {response_json=}") # *** + return response_json + + def test_get_remote_processor_columns(self) -> None: + """ + Check that a client can request processor definitions from the server + (testing both ends, just simplifying some communication between them, + e.g. removing authentication). Check that the client can synthesise + SQLAlchemy Column objects from the results. + """ + # Fetch the data + processors = self.listprocclient.get_remote_processors() + # Check it + self.assertIsInstance(processors, list) + for sp in processors: + self.assertIsInstance(sp, ServerProcessor) + log.debug(f"+++ Trying {sp.name=}") + + # We won't go so far as to set up a mock database in full. But + # check that Column object are created. + # (a) setup + c = Cloud(nlpdef=None, cfg_processor_name=None) + c.procname = sp.name + c._destdb = mock.Mock(name="mock_destdb") + c._destdb.engine = self.dummy_engine + c.set_procinfo_if_correct(sp) + log.debug(f"--- {c.schema=}") + for tablename in c.schema.keys(): + ouc = mock.Mock(name=f"mock_ouc_{tablename}") + ouc.get_columns = lambda _engine: [] + ouc.renames = {} + ouc.dest_tablename = tablename # usually a property + ouc.destfields = [] + # ... simulating OutputUserConfig + c._outputtypemap[tablename] = ouc + c._type_to_tablename[tablename] = tablename + # (b) test + self.assertTrue(c.is_tabular()) + table_columns = c.dest_tables_columns() + self.assertTrue(len(table_columns) > 0) + for tablename, columns in table_columns.items(): + log.debug(f"--- {sp.name=}: {tablename=}; {columns=}") + self.assertTrue(len(columns) > 0) + for col in columns: + self.assertIsInstance(col, Column) + + def test_cloud_pipeline_dict_format(self) -> None: + """ + Test the full pipeline: + + - create a source database (in setUp); + - build a config file (in setUp); + - create destination tables, based on remote processor definitions + using tabular_schema; + - run data through cloud NLP, and insert results. + + """ + prev = crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT + crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT = True + drop_remake(nlpdef=self.nlpdef) + process_cloud_now(crinfo=self.crinfo) + # The test is (currently) that it doesn't crash. + # Reset for other tests: + crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT = prev + + # To explore the database manually: + # import pdb; pdb.set_trace() + + def test_cloud_pipeline_list_format(self) -> None: + """ + Test the full pipeline: + + - create a source database (in setUp); + - build a config file (in setUp); + - create destination tables, based on remote processor definitions + using tabular_schema; + - run data through cloud NLP, and insert results. 
+ + """ + prev = crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT + crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT = False + drop_remake(nlpdef=self.nlpdef) + process_cloud_now(crinfo=self.crinfo) + # Reset for other tests: + crate_anon.nlp_webserver.tasks.USE_DICT_FORMAT_NLPRP_RESULT = prev diff --git a/crate_anon/nlp_manager/tests/cloud_request_sender_tests.py b/crate_anon/nlp_manager/tests/cloud_request_sender_tests.py index 8142ec4b..8a6c0e91 100644 --- a/crate_anon/nlp_manager/tests/cloud_request_sender_tests.py +++ b/crate_anon/nlp_manager/tests/cloud_request_sender_tests.py @@ -32,17 +32,17 @@ from crate_anon.nlp_manager.cloud_request import CloudRequestProcess from crate_anon.nlp_manager.cloud_request_sender import CloudRequestSender -from crate_anon.nlp_manager.constants import HashClass -from crate_anon.nlp_manager.input_field_config import ( +from crate_anon.nlp_manager.constants import ( FN_SRCDB, FN_SRCFIELD, FN_SRCPKFIELD, FN_SRCPKSTR, FN_SRCPKVAL, FN_SRCTABLE, + HashClass, ) from crate_anon.nlp_manager.models import FN_SRCHASH, NlpRecord -from crate_anon.nlprp.constants import NlprpKeys as NKeys +from crate_anon.nlprp.constants import NlprpKeys PANAMOWA = "A woman, a plan, a canal. Panamowa!" PAGODA = "A dog! A panic in a pagoda." @@ -163,14 +163,17 @@ def test_single_text_sent_in_single_request(self) -> None: include_text_in_reply=True, # has_gate_processors from config ) - records = cloud_requests[0]._request_process[NKeys.ARGS][NKeys.CONTENT] + records = cloud_requests[0]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] - self.assertEqual(records[0][NKeys.METADATA][FN_SRCPKVAL], 1) - self.assertEqual(records[0][NKeys.METADATA][FN_SRCPKSTR], "pkstr") + self.assertEqual(records[0][NlprpKeys.METADATA][FN_SRCPKVAL], 1) + self.assertEqual(records[0][NlprpKeys.METADATA][FN_SRCPKSTR], "pkstr") self.assertEqual( - records[0][NKeys.METADATA][FN_SRCHASH], self.hasher.hash(PANAMOWA) + records[0][NlprpKeys.METADATA][FN_SRCHASH], + self.hasher.hash(PANAMOWA), ) - self.assertEqual(records[0][NKeys.TEXT], PANAMOWA) + self.assertEqual(records[0][NlprpKeys.TEXT], PANAMOWA) def test_multiple_records_sent_in_single_request(self) -> None: self.test_text = [ @@ -222,11 +225,13 @@ def test_multiple_records_sent_in_single_request(self) -> None: include_text_in_reply=True, # has_gate_processors ) - records = cloud_requests[0]._request_process[NKeys.ARGS][NKeys.CONTENT] + records = cloud_requests[0]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] - self.assertEqual(records[0][NKeys.METADATA][FN_SRCPKVAL], 1) - self.assertEqual(records[1][NKeys.METADATA][FN_SRCPKSTR], "pkstr") - self.assertEqual(records[2][NKeys.TEXT], REVOLT) + self.assertEqual(records[0][NlprpKeys.METADATA][FN_SRCPKVAL], 1) + self.assertEqual(records[1][NlprpKeys.METADATA][FN_SRCPKSTR], "pkstr") + self.assertEqual(records[2][NlprpKeys.TEXT], REVOLT) def test_max_records_per_request(self) -> None: self.test_text = [ @@ -318,14 +323,20 @@ def mock_send_0_side_effect(*args, **kwargs): include_text_in_reply=True, # has_gate_processors from config ) - content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT] - self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA) + content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] + self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA) - content_1 = requests_out[1]._request_process[NKeys.ARGS][NKeys.CONTENT] - self.assertEqual(content_1[0][NKeys.TEXT], PAGODA) + content_1 = 
requests_out[1]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] + self.assertEqual(content_1[0][NlprpKeys.TEXT], PAGODA) - content_2 = requests_out[2]._request_process[NKeys.ARGS][NKeys.CONTENT] - self.assertEqual(content_2[0][NKeys.TEXT], REVOLT) + content_2 = requests_out[2]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] + self.assertEqual(content_2[0][NlprpKeys.TEXT], REVOLT) logger_name = "crate_anon.nlp_manager.cloud_request_sender" expected_message = "Sent request to be processed: #1 of this block" @@ -397,11 +408,13 @@ def test_limit_before_commit_2(self) -> None: include_text_in_reply=True, # has_gate_processors from config ) - content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT] + content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] self.assertEqual(len(content_0), 2) - self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA) + self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA) - self.assertEqual(content_0[1][NKeys.TEXT], PAGODA) + self.assertEqual(content_0[1][NlprpKeys.TEXT], PAGODA) def test_max_content_length(self) -> None: self.test_text = [ @@ -473,13 +486,17 @@ def test_max_content_length(self) -> None: include_text_in_reply=True, # has_gate_processors from config ) - content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT] - self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA) + content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] + self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA) - self.assertEqual(content_0[1][NKeys.TEXT], PAGODA) + self.assertEqual(content_0[1][NlprpKeys.TEXT], PAGODA) - content_1 = requests_out[1]._request_process[NKeys.ARGS][NKeys.CONTENT] - self.assertEqual(content_1[0][NKeys.TEXT], REVOLT) + content_1 = requests_out[1]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] + self.assertEqual(content_1[0][NlprpKeys.TEXT], REVOLT) def test_record_bigger_than_max_content_length_skipped(self) -> None: short_text = "Some text with serialized length greater than 500. 
" @@ -553,12 +570,16 @@ def test_record_bigger_than_max_content_length_skipped(self) -> None: include_text_in_reply=True, # has_gate_processors from config ) - content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT] + content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] self.assertEqual(len(content_0), 1) - self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA) + self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA) - content_1 = requests_out[1]._request_process[NKeys.ARGS][NKeys.CONTENT] - self.assertEqual(content_1[0][NKeys.TEXT], REVOLT) + content_1 = requests_out[1]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] + self.assertEqual(content_1[0][NlprpKeys.TEXT], REVOLT) logger_name = "crate_anon.nlp_manager.cloud_request_sender" self.assertIn( @@ -643,8 +664,10 @@ def get_progress_record(pkval: int, pkstr: str) -> Optional[NlpRecord]: include_text_in_reply=True, # has_gate_processors from config ) - content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT] - self.assertEqual(content_0[0][NKeys.TEXT], PAGODA) + content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] + self.assertEqual(content_0[0][NlprpKeys.TEXT], PAGODA) logger_name = "crate_anon.nlp_manager.cloud_request_sender" self.assertIn( @@ -821,8 +844,10 @@ def test_record_with_no_text_skipped(self) -> None: include_text_in_reply=True, # has_gate_processors from config ) - content_0 = requests_out[0]._request_process[NKeys.ARGS][NKeys.CONTENT] + content_0 = requests_out[0]._request_process[NlprpKeys.ARGS][ + NlprpKeys.CONTENT + ] self.assertEqual(len(content_0), 2) - self.assertEqual(content_0[0][NKeys.TEXT], PANAMOWA) + self.assertEqual(content_0[0][NlprpKeys.TEXT], PANAMOWA) - self.assertEqual(content_0[1][NKeys.TEXT], REVOLT) + self.assertEqual(content_0[1][NlprpKeys.TEXT], REVOLT) diff --git a/crate_anon/nlp_manager/tests/nlp_parser_tests.py b/crate_anon/nlp_manager/tests/nlp_parser_tests.py index fe04a843..e454bdc1 100644 --- a/crate_anon/nlp_manager/tests/nlp_parser_tests.py +++ b/crate_anon/nlp_manager/tests/nlp_parser_tests.py @@ -37,6 +37,8 @@ class FruitParser(BaseNlpParser): + is_test_nlp_parser = True + def test(self, verbose: bool = False) -> None: pass diff --git a/crate_anon/nlp_webserver/procs.py b/crate_anon/nlp_webserver/procs.py index 77cb837d..3a223ca9 100644 --- a/crate_anon/nlp_webserver/procs.py +++ b/crate_anon/nlp_webserver/procs.py @@ -33,10 +33,7 @@ from crate_anon.common.constants import EnvVar from crate_anon.nlprp.constants import NlprpKeys -from crate_anon.nlp_webserver.constants import ( - KEY_PROCTYPE, - NlpServerConfigKeys, -) +from crate_anon.nlp_webserver.constants import NlpServerConfigKeys from crate_anon.nlp_webserver.server_processor import ServerProcessor from crate_anon.nlp_webserver.settings import SETTINGS @@ -57,16 +54,7 @@ _name = _proc[NlprpKeys.NAME] _version = _proc[NlprpKeys.VERSION] log.info(f"Registering NLPRP processor {_name!r} (v{_version})") - _x = ServerProcessor( - name=_name, - title=_proc[NlprpKeys.TITLE], - version=_version, - is_default_version=_proc[NlprpKeys.IS_DEFAULT_VERSION], - description=_proc[NlprpKeys.DESCRIPTION], - proctype=_proc.get(KEY_PROCTYPE), # may be None - schema_type=_proc[NlprpKeys.SCHEMA_TYPE], # 'unknown' or 'tabular' - sql_dialect=_proc.get(NlprpKeys.SQL_DIALECT), - tabular_schema=_proc.get(NlprpKeys.TABULAR_SCHEMA), - ) # registers with the ServerProcessor class - # Doing this here saves time per request + _x = 
ServerProcessor.from_nlprp_json_dict(_proc) + # ... registers with the ServerProcessor class + # Doing this here saves time per request: _x.set_parser() diff --git a/crate_anon/nlp_webserver/server_processor.py b/crate_anon/nlp_webserver/server_processor.py index c2e92af4..541f0f4b 100644 --- a/crate_anon/nlp_webserver/server_processor.py +++ b/crate_anon/nlp_webserver/server_processor.py @@ -30,11 +30,17 @@ from typing import Dict, Optional, Any from crate_anon.nlp_manager.base_nlp_parser import BaseNlpParser -from crate_anon.nlp_manager.all_processors import make_nlp_parser_unconfigured +from crate_anon.nlp_manager.processor_helpers import ( + make_nlp_parser_unconfigured, +) from crate_anon.nlprp.api import JsonObjectType, NlprpServerProcessor from crate_anon.nlprp.constants import NlprpKeys, NlprpValues from crate_anon.nlprp.errors import BAD_REQUEST, mkerror, no_such_proc_error -from crate_anon.nlp_webserver.constants import PROCTYPE_GATE, GATE_BASE_URL +from crate_anon.nlp_webserver.constants import ( + KEY_PROCTYPE, + PROCTYPE_GATE, + GATE_BASE_URL, +) class ServerProcessor(NlprpServerProcessor): @@ -93,9 +99,38 @@ def __init__( # Add instance to list of processors ServerProcessor.processors[self.processor_id] = self + @classmethod + def from_nlprp_json_dict( + cls, processor_dict: Dict[str, Any] + ) -> NlprpServerProcessor: + pd = processor_dict # shorthand + return ServerProcessor( + name=pd[NlprpKeys.NAME], + title=pd[NlprpKeys.TITLE], + version=pd[NlprpKeys.VERSION], + is_default_version=pd[NlprpKeys.IS_DEFAULT_VERSION], + description=pd[NlprpKeys.DESCRIPTION], + proctype=pd.get(KEY_PROCTYPE), # may be None + schema_type=pd[NlprpKeys.SCHEMA_TYPE], # 'unknown' or 'tabular' + sql_dialect=pd.get(NlprpKeys.SQL_DIALECT), + tabular_schema=pd.get(NlprpKeys.TABULAR_SCHEMA), + ) # also registers with the ServerProcessor class + + @classmethod + def debug_remove_processor(cls, name: str, version: str) -> None: + """ + For debugging purposes (testing). De-registers a processor. + """ + processor_id = cls._mk_processor_id(name, version) + cls.processors.pop(processor_id, None) # delete if present + + @classmethod + def _mk_processor_id(cls, name: str, version: str) -> str: + return f"{name}_{version}" + @property def processor_id(self) -> str: - return f"{self.name}_{self.version}" + return self._mk_processor_id(self.name, self.version) @classmethod def get_processor(cls, name: str, version: str = "") -> "ServerProcessor": diff --git a/crate_anon/nlp_webserver/tasks.py b/crate_anon/nlp_webserver/tasks.py index b53e2cd5..62a91d68 100644 --- a/crate_anon/nlp_webserver/tasks.py +++ b/crate_anon/nlp_webserver/tasks.py @@ -27,6 +27,7 @@ """ +from collections import defaultdict import datetime import json import logging @@ -66,6 +67,18 @@ log = logging.getLogger(__name__) +# ============================================================================= +# Constants +# ============================================================================= + +# Use the more explicit NLPRP dictionary (JSON object) result format, rather +# than the simpler list (JSON array) format? 
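+#
+# Illustrative example (with hypothetical column names): a processor that
+# emits rows for two tables, say "crp" and "alcoholunits", returns, in the
+# dictionary format, something like
+#     {"crp": [{"value_mg_l": 7.0, ...}],
+#      "alcoholunits": [{"units": 2.0, ...}]}
+# whereas the list format is a single flat array of rows,
+#     [{"value_mg_l": 7.0, ...}, {"units": 2.0, ...}]
+# which cannot distinguish results from more than one table (see
+# process_nlp_internal below).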
+USE_DICT_FORMAT_NLPRP_RESULT = True
+
+# Following on from that:
+EMPTY_PROCESSOR_RESULT = {} if USE_DICT_FORMAT_NLPRP_RESULT else []
+
+
 # =============================================================================
 # SQLAlchemy and Celery setup
 # =============================================================================
@@ -122,7 +135,7 @@ def nlprp_processor_dict(
         processor:
             a :class:`crate_anon.nlp_webserver.procs.Processor`, or ``None``
         results:
-            a JSON array of results
+            a JSON array (or dict) of results
         errcode:
             (if not ``success``) an integer error code
         errmsg:
@@ -131,7 +144,10 @@
     Returns:
         a JSON object in NLPRP format
     """
-    proc_dict = {NlprpKeys.RESULTS: results or [], NlprpKeys.SUCCESS: success}
+    proc_dict = {
+        NlprpKeys.RESULTS: results or EMPTY_PROCESSOR_RESULT,
+        NlprpKeys.SUCCESS: success,
+    }
     if processor:
         proc_dict[NlprpKeys.NAME] = processor.name
         proc_dict[NlprpKeys.TITLE] = processor.title
@@ -394,6 +410,12 @@ def process_nlp_internal(
     """
     Send text to a chosen CRATE Python NLP processor and return a
     :class:`NlpServerResult`.
+
+    Args:
+        text:
+            The text to process.
+        processor:
+            The NLP processor to use.
     """
     parser = processor.parser
     try:
@@ -402,8 +424,26 @@
         return internal_error(
             f"parser is not a CRATE Python NLP parser; is {parser!r}"
         )
-    # Get second element of each element in parsed text as first is tablename
-    # which will have no meaning here
-    return nlprp_processor_dict(
-        True, processor, results=[x[1] for x in tablename_valuedict_generator]
-    )
+    if USE_DICT_FORMAT_NLPRP_RESULT:
+        results = defaultdict(list)
+        for tablename, tableresult in tablename_valuedict_generator:
+            results[tablename].append(tableresult)
+        # If we use defaultdict(list) here, unmodified, the default JSON
+        # serialiser messes up and produces things like {'results':
+        # defaultdict(<class 'list'>, {'alcoholunits': [{'variable_name': ...
+        # The code above is probably quicker as an iterator, but we can convert
+        # here:
+        results = dict(results)
+    else:
+        # Get second element of each element in parsed text as first is
+        # tablename which will have no meaning here
+        results = []
+        tables_seen = set()
+        for tablename, tableresult in tablename_valuedict_generator:
+            results.append(tableresult)
+            tables_seen.add(tablename)
+        assert len(tables_seen) < 2, (
+            "Internal bug: can't return results from multiple tables (within "
+            "a single NLP processor) in single-table list format"
+        )
+    return nlprp_processor_dict(True, processor, results=results)
diff --git a/crate_anon/nlp_webserver/views.py b/crate_anon/nlp_webserver/views.py
index 106cd474..d7e6554e 100644
--- a/crate_anon/nlp_webserver/views.py
+++ b/crate_anon/nlp_webserver/views.py
@@ -67,7 +67,7 @@
 )
 from crate_anon.nlprp.constants import (
     NlprpCommands,
-    NlprpKeys as NKeys,
+    NlprpKeys,
     NlprpValues,
 )
 from crate_anon.nlprp.errors import (
@@ -190,7 +190,7 @@ def __init__(self, nlprp_request: JsonObjectType) -> None:
         # The processors being requested. We fetch all of them now, so they
         # can be iterated through fast for each document.
         requested_processors = json_get_array(
-            args, NKeys.PROCESSORS, required=True
+            args, NlprpKeys.PROCESSORS, required=True
         )
         self.processors = [
             ServerProcessor.get_processor_nlprp(d)
             for d in requested_processors
         ]
 
         # Queue?
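+        # (If true, the client wants queued, asynchronous processing; if
+        # false, immediate processing. See put_in_queue() and process_now()
+        # below.)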
- self.queue = json_get_bool(args, NKeys.QUEUE, default=False) + self.queue = json_get_bool(args, NlprpKeys.QUEUE, default=False) # Client job ID self.client_job_id = json_get_str( - args, NKeys.CLIENT_JOB_ID, default="" + args, NlprpKeys.CLIENT_JOB_ID, default="" ) # Include the source text in the reply? - self.include_text = json_get_bool(args, NKeys.INCLUDE_TEXT) + self.include_text = json_get_bool(args, NlprpKeys.INCLUDE_TEXT) # Content: list of objects (each with text and metadata) - self.content = json_get_array(args, NKeys.CONTENT, required=True) + self.content = json_get_array(args, NlprpKeys.CONTENT, required=True) def processor_ids(self) -> List[str]: """ @@ -236,9 +236,9 @@ def gen_text_metadataobj( tuple: ``(text, metadata)``, as above """ for document in self.content: - text = json_get_str(document, NKeys.TEXT, required=True) + text = json_get_str(document, NlprpKeys.TEXT, required=True) metadata = json_get_value( - document, NKeys.METADATA, default=None, required=False + document, NlprpKeys.METADATA, default=None, required=False ) yield text, metadata @@ -252,16 +252,16 @@ def gen_text_metadatastr(self) -> Generator[Tuple[str, str], None, None]: """ try: for document in self.content: - text = json_get_str(document, NKeys.TEXT, required=True) + text = json_get_str(document, NlprpKeys.TEXT, required=True) metadata = json_get_value( - document, NKeys.METADATA, default=None, required=False + document, NlprpKeys.METADATA, default=None, required=False ) metadata_str = json.dumps( metadata, separators=JSON_SEPARATORS_COMPACT ) yield text, metadata_str except KeyError: - raise key_missing_error(key=NKeys.TEXT) + raise key_missing_error(key=NlprpKeys.TEXT) # ============================================================================= @@ -321,14 +321,14 @@ def create_response( # Put status in HTTP header self.set_http_response_status(status) response_dict = { - NKeys.STATUS: status, - NKeys.PROTOCOL: { - NKeys.NAME: NlprpValues.NLPRP_PROTOCOL_NAME, - NKeys.VERSION: NLPRP_VERSION_STRING, + NlprpKeys.STATUS: status, + NlprpKeys.PROTOCOL: { + NlprpKeys.NAME: NlprpValues.NLPRP_PROTOCOL_NAME, + NlprpKeys.VERSION: NLPRP_VERSION_STRING, }, - NKeys.SERVER_INFO: { - NKeys.NAME: SERVER_NAME, - NKeys.VERSION: SERVER_VERSION, + NlprpKeys.SERVER_INFO: { + NlprpKeys.NAME: SERVER_NAME, + NlprpKeys.VERSION: SERVER_VERSION, }, } if extra_info is not None: @@ -344,11 +344,11 @@ def create_error_response(self, error: NlprpError) -> JsonObjectType: # Turned 'errors' into array # Should this allow for multiple errors? 
error_info = { - NKeys.ERRORS: [ + NlprpKeys.ERRORS: [ { - NKeys.CODE: error.code, - NKeys.MESSAGE: error.message, - NKeys.DESCRIPTION: error.description, + NlprpKeys.CODE: error.code, + NlprpKeys.MESSAGE: error.message, + NlprpKeys.DESCRIPTION: error.description, } ] } @@ -450,7 +450,7 @@ def handle_nlprp_request(self) -> JsonObjectType: """ self._authenticate() self._set_body_json_from_request() - command = json_get_str(self.body, NKeys.COMMAND, required=True) + command = json_get_str(self.body, NlprpKeys.COMMAND, required=True) log.debug( f"NLPRP request received from {self.request.remote_addr}: " f"username={self.username}, command={command}" @@ -489,7 +489,7 @@ def list_processors(self) -> JsonObjectType: return self.create_response( status=HttpStatus.OK, extra_info={ - NKeys.PROCESSORS: [ + NlprpKeys.PROCESSORS: [ proc.infodict for proc in ServerProcessor.processors.values() ] @@ -518,23 +518,23 @@ def process_now( password=self.password, ) # proc_dict = procresult.nlprp_processor_dict(processor) - if procresult[NKeys.NAME] is None: - procresult[NKeys.NAME] = processor.name - procresult[NKeys.TITLE] = processor.title - procresult[NKeys.VERSION] = processor.version + if procresult[NlprpKeys.NAME] is None: + procresult[NlprpKeys.NAME] = processor.name + procresult[NlprpKeys.TITLE] = processor.title + procresult[NlprpKeys.VERSION] = processor.version processor_data.append(procresult) doc_result = { - NKeys.METADATA: metadata, - NKeys.PROCESSORS: processor_data, + NlprpKeys.METADATA: metadata, + NlprpKeys.PROCESSORS: processor_data, } if process_request.include_text: - doc_result[NKeys.TEXT] = text + doc_result[NlprpKeys.TEXT] = text results.append(doc_result) response_info = { - NKeys.CLIENT_JOB_ID: process_request.client_job_id, - NKeys.RESULTS: results, + NlprpKeys.CLIENT_JOB_ID: process_request.client_job_id, + NlprpKeys.RESULTS: results, } return self.create_response( status=HttpStatus.OK, extra_info=response_info @@ -601,7 +601,7 @@ def put_in_queue( task_id=dpr_id, # for Celery ) - response_info = {NKeys.QUEUE_ID: queue_id} + response_info = {NlprpKeys.QUEUE_ID: queue_id} return self.create_response( status=HttpStatus.ACCEPTED, extra_info=response_info ) @@ -615,7 +615,7 @@ def fetch_from_queue(self) -> JsonObjectType: # Args # --------------------------------------------------------------------- args = json_get_toplevel_args(self.body) - queue_id = json_get_str(args, NKeys.QUEUE_ID, required=True) + queue_id = json_get_str(args, NlprpKeys.QUEUE_ID, required=True) # --------------------------------------------------------------------- # Start with the DocProcRequests, because if some are still busy, @@ -642,8 +642,8 @@ def fetch_from_queue(self) -> JsonObjectType: return self.create_response( HttpStatus.ACCEPTED, { - NKeys.N_DOCPROCS: n, - NKeys.N_DOCPROCS_COMPLETED: n_done, + NlprpKeys.N_DOCPROCS: n, + NlprpKeys.N_DOCPROCS_COMPLETED: n_done, }, ) @@ -682,19 +682,19 @@ def get_processor_cached(_processor_id: str) -> ServerProcessor: # ... 
data for *all* the processors for this doc for dpr in doc.docprocrequests: procresult = json.loads(dpr.results) # type: Dict[str, Any] - if procresult[NKeys.NAME] is None: + if procresult[NlprpKeys.NAME] is None: processor = get_processor_cached(dpr.processor_id) - procresult[NKeys.NAME] = processor.name - procresult[NKeys.TITLE] = processor.title - procresult[NKeys.VERSION] = processor.version + procresult[NlprpKeys.NAME] = processor.name + procresult[NlprpKeys.TITLE] = processor.title + procresult[NlprpKeys.VERSION] = processor.version processor_data.append(procresult) metadata = json.loads(doc.client_metadata) doc_result = { - NKeys.METADATA: metadata, - NKeys.PROCESSORS: processor_data, + NlprpKeys.METADATA: metadata, + NlprpKeys.PROCESSORS: processor_data, } if doc.include_text: - doc_result[NKeys.TEXT] = doc.doctext + doc_result[NlprpKeys.TEXT] = doc.doctext doc_results.append(doc_result) # --------------------------------------------------------------------- @@ -706,10 +706,10 @@ def get_processor_cached(_processor_id: str) -> ServerProcessor: # ... will also delete the DocProcRequests via a cascade response_info = { - NKeys.CLIENT_JOB_ID: ( + NlprpKeys.CLIENT_JOB_ID: ( client_job_id if client_job_id is not None else "" ), - NKeys.RESULTS: doc_results, + NlprpKeys.RESULTS: doc_results, } return self.create_response( status=HttpStatus.OK, extra_info=response_info @@ -724,7 +724,7 @@ def show_queue(self) -> JsonObjectType: args = json_get_toplevel_args(self.body, required=False) if args: client_job_id = json_get_str( - args, NKeys.CLIENT_JOB_ID, default="", required=False + args, NlprpKeys.CLIENT_JOB_ID, default="", required=False ) else: client_job_id = "" @@ -764,15 +764,15 @@ def show_queue(self) -> JsonObjectType: queue_answer.append( { - NKeys.QUEUE_ID: queue_id, - NKeys.CLIENT_JOB_ID: client_job_id, - NKeys.STATUS: ( + NlprpKeys.QUEUE_ID: queue_id, + NlprpKeys.CLIENT_JOB_ID: client_job_id, + NlprpKeys.STATUS: ( NlprpValues.BUSY if busy else NlprpValues.READY ), - NKeys.DATETIME_SUBMITTED: pendulum_to_nlprp_datetime( + NlprpKeys.DATETIME_SUBMITTED: pendulum_to_nlprp_datetime( dt_submitted, to_utc=True ), - NKeys.DATETIME_COMPLETED: ( + NlprpKeys.DATETIME_COMPLETED: ( None if busy else pendulum_to_nlprp_datetime(max_time, to_utc=True) @@ -780,7 +780,7 @@ def show_queue(self) -> JsonObjectType: } ) return self.create_response( - status=HttpStatus.OK, extra_info={NKeys.QUEUE: queue_answer} + status=HttpStatus.OK, extra_info={NlprpKeys.QUEUE: queue_answer} ) def delete_from_queue(self) -> JsonObjectType: @@ -788,8 +788,8 @@ def delete_from_queue(self) -> JsonObjectType: Deletes from the queue all entries specified by the client. """ args = json_get_toplevel_args(self.body) - delete_all = json_get_bool(args, NKeys.DELETE_ALL, default=False) - client_job_ids = json_get_array_of_str(args, NKeys.CLIENT_JOB_IDS) + delete_all = json_get_bool(args, NlprpKeys.DELETE_ALL, default=False) + client_job_ids = json_get_array_of_str(args, NlprpKeys.CLIENT_JOB_IDS) # Establish what to cancel/delete q_dpr = ( diff --git a/crate_anon/nlprp/api.py b/crate_anon/nlprp/api.py index 199ace8c..e3ad3394 100644 --- a/crate_anon/nlprp/api.py +++ b/crate_anon/nlprp/api.py @@ -840,3 +840,10 @@ def __str__(self) -> str: def __repr__(self) -> str: return auto_repr(self) + + def is_tabular(self) -> bool: + """ + Is the format of the schema information given by the remote processor + tabular? 
+ """ + return self.schema_type == NlprpValues.TABULAR diff --git a/crate_anon/nlprp/nlprp_test_client.py b/crate_anon/nlprp/nlprp_test_client.py index 3b6aeaa1..bc549569 100755 --- a/crate_anon/nlprp/nlprp_test_client.py +++ b/crate_anon/nlprp/nlprp_test_client.py @@ -110,7 +110,7 @@ def fail(msg: str) -> NoReturn: # "deflate" transfer-encodings from the server; see # http://docs.python-requests.org/en/master/user/quickstart/. # - The difference between gzip and deflate: - # https://stackoverflow.com/questions/388595/why-use-deflate-instead-of-gzip-for-text-files-served-by-apache # noqa + # https://stackoverflow.com/questions/388595/why-use-deflate-instead-of-gzip-for-text-files-served-by-apache # noqa: E501 log.debug( f"Reply has status_code={r.status_code}, headers={r.headers!r}, " f"text={r.text!r}" @@ -120,7 +120,7 @@ def fail(msg: str) -> NoReturn: # "r.text" automatically does gzip decode except ( ValueError - ): # includes simplejson.errors.JSONDecodeError, json.decoder.JSONDecodeError # noqa + ): # includes simplejson.errors.JSONDecodeError, json.decoder.JSONDecodeError # noqa: E501 fail("Reply was not JSON") response = None # for type checker log.debug(f"Response JSON decoded to: {response.dict!r}") diff --git a/crate_anon/preprocess/postcodes.py b/crate_anon/preprocess/postcodes.py index f5841844..f7ee56cc 100755 --- a/crate_anon/preprocess/postcodes.py +++ b/crate_anon/preprocess/postcodes.py @@ -61,7 +61,7 @@ - https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography#output-area-oa -""" # noqa +""" # noqa: E501 from abc import ABC, ABCMeta, abstractmethod import argparse @@ -268,7 +268,7 @@ class ExtendedBase: and thus define this class to inherit from those two metaclasses, so it can be the metaclass we want. - """ # noqa + """ # noqa: E501 pass @@ -671,7 +671,7 @@ class CASWard(Base): Represents censua area statistics (CAS) wards in the UK, 2003. - https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography#statistical-wards-cas-wards-and-st-wards - """ # noqa + """ # noqa: E501 __filename__ = "CAS ward names and codes UK as at 01_03.xlsx" __tablename__ = "cas_ward_2003" @@ -1107,9 +1107,9 @@ def __init__(self, **kwargs: Any) -> None: # ============================================================================= # Models: centroids # ============================================================================= -# https://webarchive.nationalarchives.gov.uk/20160105160709/https://www.ons.gov.uk/ons/guide-method/geography/products/census/spatial/centroids/index.html # noqa +# https://webarchive.nationalarchives.gov.uk/20160105160709/https://www.ons.gov.uk/ons/guide-method/geography/products/census/spatial/centroids/index.html # noqa: E501 # -# Looking at lower_layer_super_output_areas_(e+w)_2011_population_weighted_centroids_v2.zip : # noqa +# Looking at lower_layer_super_output_areas_(e+w)_2011_population_weighted_centroids_v2.zip : # noqa: E501 # - LSOA_2011_EW_PWC.shp -- probably a Shape file; # ... yes # ... https://en.wikipedia.org/wiki/Shapefile @@ -1124,7 +1124,7 @@ class PopWeightedCentroidsLsoa2011(Base): That is, the geographical centre of the LSOA, weighted by population. (A first approximation: imagine every person pulling on the centroid simultaneously and with equal force from their home. Where will it end up?) 
- """ # noqa + """ __filename__ = "LSOA_2011_EW_PWC_COORD_V2.CSV" __tablename__ = "pop_weighted_centroids_lsoa_2011" # __debug_content__ = True @@ -1133,7 +1133,7 @@ class PopWeightedCentroidsLsoa2011(Base): lsoa_name = Column(String(NAME_LEN)) bng_north = Column(Integer, comment="British National Grid, North (m)") bng_east = Column(Integer, comment="British National Grid, East (m)") - # https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid#All-numeric_grid_references # noqa + # https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid#All-numeric_grid_references # noqa: E501 latitude = Column(Numeric(precision=13, scale=10), comment="Latitude (degrees, 10dp)") longitude = Column(Numeric(precision=13, scale=10), diff --git a/crate_anon/preprocess/preprocess_pcmis.py b/crate_anon/preprocess/preprocess_pcmis.py index b5dd02c4..78b6d2b0 100755 --- a/crate_anon/preprocess/preprocess_pcmis.py +++ b/crate_anon/preprocess/preprocess_pcmis.py @@ -569,7 +569,7 @@ def get_pcmis_dd_settings(ddhint: DDHint) -> str: {sk.DDGEN_FORCE_LOWER_CASE} = False {sk.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE} = True -""" # noqa +""" # noqa: E501 # ============================================================================= diff --git a/crate_anon/preprocess/rio_ddgen.py b/crate_anon/preprocess/rio_ddgen.py index 7b1d1e9f..6fb240ba 100644 --- a/crate_anon/preprocess/rio_ddgen.py +++ b/crate_anon/preprocess/rio_ddgen.py @@ -392,4 +392,4 @@ def get_rio_dd_settings(ddhint: DDHint) -> str: {sk.DDGEN_FORCE_LOWER_CASE} = False {sk.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE} = True -""" # noqa +""" # noqa: E501 diff --git a/crate_anon/preprocess/rio_view_func.py b/crate_anon/preprocess/rio_view_func.py index 7bbec64e..b63e40ac 100644 --- a/crate_anon/preprocess/rio_view_func.py +++ b/crate_anon/preprocess/rio_view_func.py @@ -249,7 +249,7 @@ def lookup_from_fragment_first_row_outer_apply( - https://stackoverflow.com/questions/2043259/sql-server-how-to-join-to-first-row - https://stackoverflow.com/questions/9275132/real-life-example-when-to-use-outer-cross-apply-in-sql - """ # noqa + """ # noqa: E501 return ( f"OUTER APPLY (SELECT TOP 1 {', '.join(lookup_fields)} " f"FROM {lookup_table} " diff --git a/crate_anon/preprocess/systmone_ddgen.py b/crate_anon/preprocess/systmone_ddgen.py index bf67d219..be9884c4 100644 --- a/crate_anon/preprocess/systmone_ddgen.py +++ b/crate_anon/preprocess/systmone_ddgen.py @@ -410,7 +410,7 @@ et al. 2014, https://pubmed.ncbi.nlm.nih.gov/25261970/) perform similar researcher/data separation via other methods. -""" # noqa +""" # noqa: E501 # todo: SystmOne (CRATE traffic-light system): implement S1_ClinicalOutcome_ConsentResearch # noqa diff --git a/crate_anon/tools/launch_celery.py b/crate_anon/tools/launch_celery.py index 5264590d..d7f0c6d4 100755 --- a/crate_anon/tools/launch_celery.py +++ b/crate_anon/tools/launch_celery.py @@ -92,7 +92,7 @@ def inner_main() -> None: # Without "--pool=solo", sod all happens: tasks go into the # Reserved queue, but aren't executed. 
# See: - # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#module-celery.bin.worker # noqa + # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#module-celery.bin.worker # noqa: E501 # https://github.com/celery/celery/issues/2146 cmdargs += ["--pool=solo"] diff --git a/crate_anon/tools/print_crateweb_demo_config.py b/crate_anon/tools/print_crateweb_demo_config.py index badc271a..0658a687 100755 --- a/crate_anon/tools/print_crateweb_demo_config.py +++ b/crate_anon/tools/print_crateweb_demo_config.py @@ -76,21 +76,21 @@ # ============================================================================= # Site URL configuration # ============================================================================= -# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa +# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501 -# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" # example for Apache # noqa -# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://localhost:8000" # for the Django dev server # noqa +# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" # example for Apache # noqa: E501 +# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://localhost:8000" # for the Django dev server # noqa: E501 DJANGO_SITE_ROOT_ABSOLUTE_URL = "@@django_site_root_absolute_url@@" FORCE_SCRIPT_NAME = "@@force_script_name@@" # FORCE_SCRIPT_NAME = "" # example for Apache root hosting -# FORCE_SCRIPT_NAME = "/crate" # example for CherryPy or Apache non-root hosting # noqa +# FORCE_SCRIPT_NAME = "/crate" # example for CherryPy or Apache non-root hosting # noqa: E501 # ============================================================================= # Site security # ============================================================================= -# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa +# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501 # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = "@@secret_key@@" @@ -399,12 +399,12 @@ def always_show_toolbar(request: "HttpRequest") -> bool: EMAIL_USE_SSL = False # Who will the e-mails appear to come from? -EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa +EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa: E501 # ----------------------------------------------------------------------------- # Additional settings # ----------------------------------------------------------------------------- -# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa +# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501 # During development, we route all consent-related e-mails to the developer. # Switch SAFETY_CATCH_ON to False for production mode. diff --git a/crate_anon/version.py b/crate_anon/version.py index 913be140..aa90b4ed 100644 --- a/crate_anon/version.py +++ b/crate_anon/version.py @@ -34,10 +34,10 @@ # Constants # ============================================================================= -CRATE_VERSION = "0.20.5" +CRATE_VERSION = "0.20.6" CRATE_VERSION_DATE = "2024-06-26" -MINIMUM_PYTHON_VERSION = (3, 8) +MINIMUM_PYTHON_VERSION = (3, 9) # Only other place that has this: install_virtualenv.py (which can't import # CRATE packages). 
diff --git a/debugging/mssql_without_transaction.py b/debugging/mssql_without_transaction.py index 32805738..5e423b58 100755 --- a/debugging/mssql_without_transaction.py +++ b/debugging/mssql_without_transaction.py @@ -99,7 +99,7 @@ def execute(engine_or_conn: Union[Connection, Engine], sql: str) -> None: - SQLSetConnectAttr is an ODBC function: https://docs.microsoft.com/en-us/sql/odbc/reference/syntax/sqlsetconnectattr-function -""" # noqa +""" # noqa: E501 print("SQLAlchemy engine again:") show_query(engine, query_trancount) # 1 diff --git a/docker/dockerfiles/.env b/docker/dockerfiles/.env index 05455b91..177c8c36 100644 --- a/docker/dockerfiles/.env +++ b/docker/dockerfiles/.env @@ -24,7 +24,7 @@ CRATE_DOCKER_GATE_VERSION=9.0.1 # Need to keep in sync with crate_anon/version.py # Use e.g. -rc1 suffix in development -CRATE_DOCKER_IMAGE_TAG=crate:0.20.5 +CRATE_DOCKER_IMAGE_TAG=crate:0.20.6-rc3 CRATE_DOCKER_ODBC_USER_CONFIG=odbc_user.ini diff --git a/docs/recreate_inclusion_files.py b/docs/recreate_inclusion_files.py index bf590ede..33b691d3 100755 --- a/docs/recreate_inclusion_files.py +++ b/docs/recreate_inclusion_files.py @@ -404,7 +404,7 @@ def main(): sudo npm install -g n sudo n stable sudo npm install npm@latest -g -""" # noqa +""" # noqa: E501 ) raise diff --git a/docs/source/anonymisation/_crate_anon_check_text_extractor.txt b/docs/source/anonymisation/_crate_anon_check_text_extractor.txt index 61c1b6e7..38082a67 100644 --- a/docs/source/anonymisation/_crate_anon_check_text_extractor.txt +++ b/docs/source/anonymisation/_crate_anon_check_text_extractor.txt @@ -1,7 +1,7 @@ USAGE: crate_anon_check_text_extractor [-h] [checkextractor ...] Check availability of tools to extract text from different document formats. -(CRATE version 0.20.5, 2024-06-26. Created by Rudolf Cardinal.) +(CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.) POSITIONAL ARGUMENTS: checkextractor File extensions to check for availability of a text diff --git a/docs/source/anonymisation/_crate_anon_demo_config_help.txt b/docs/source/anonymisation/_crate_anon_demo_config_help.txt index 22741d16..296b24b3 100644 --- a/docs/source/anonymisation/_crate_anon_demo_config_help.txt +++ b/docs/source/anonymisation/_crate_anon_demo_config_help.txt @@ -1,6 +1,6 @@ USAGE: crate_anon_demo_config [-h] [--output OUTPUT] [--leave_placeholders] -Print a demo config file for the CRATE anonymiser. (CRATE version 0.20.5, +Print a demo config file for the CRATE anonymiser. (CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.) OPTIONS: diff --git a/docs/source/anonymisation/_crate_anon_draft_dd.txt b/docs/source/anonymisation/_crate_anon_draft_dd.txt index dac55775..ece98795 100644 --- a/docs/source/anonymisation/_crate_anon_draft_dd.txt +++ b/docs/source/anonymisation/_crate_anon_draft_dd.txt @@ -11,7 +11,7 @@ USAGE: crate_anon_draft_dd [-h] [--config CONFIG] [--verbose] [--incremental] [--systmone_no_table_info_in_comments] Draft a data dictionary for the anonymiser, by scanning a source database. -(CRATE version 0.20.5, 2024-06-26. Created by Rudolf Cardinal.) +(CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.) 
OPTIONS: -h, --help show this help message and exit diff --git a/docs/source/anonymisation/_crate_anon_show_counts_help.txt b/docs/source/anonymisation/_crate_anon_show_counts_help.txt index 1c2f2524..8cb66d00 100644 --- a/docs/source/anonymisation/_crate_anon_show_counts_help.txt +++ b/docs/source/anonymisation/_crate_anon_show_counts_help.txt @@ -1,6 +1,6 @@ USAGE: crate_anon_show_counts [-h] [--config CONFIG] [--verbose] -Print record counts from source/destination databases. (CRATE version 0.20.5, +Print record counts from source/destination databases. (CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.) OPTIONS: diff --git a/docs/source/anonymisation/_crate_anon_summarize_dd_help.txt b/docs/source/anonymisation/_crate_anon_summarize_dd_help.txt index c54f9a65..e26a859d 100644 --- a/docs/source/anonymisation/_crate_anon_summarize_dd_help.txt +++ b/docs/source/anonymisation/_crate_anon_summarize_dd_help.txt @@ -2,7 +2,7 @@ USAGE: crate_anon_summarize_dd [-h] [--config CONFIG] [--verbose] [--output OUTPUT] Summarize a data dictionary for the anonymiser. The resulting -spreadsheet-style report has one row per source table. (CRATE version 0.20.5, +spreadsheet-style report has one row per source table. (CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.) OPTIONS: diff --git a/docs/source/anonymisation/_crate_anonymise_help.txt b/docs/source/anonymisation/_crate_anonymise_help.txt index 89a09bc1..5a2dbfa8 100644 --- a/docs/source/anonymisation/_crate_anonymise_help.txt +++ b/docs/source/anonymisation/_crate_anonymise_help.txt @@ -11,7 +11,7 @@ USAGE: crate_anonymise [-h] [--config CONFIG] [--version] [--verbose] [--reportevery [REPORTEVERY]] [--debugscrubbers] [--savescrubbers] [--echo] -Database anonymiser. (CRATE version 0.20.5, 2024-06-26. Created by Rudolf +Database anonymiser. (CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.) OPTIONS: diff --git a/docs/source/anonymisation/_crate_anonymise_multiprocess_help.txt b/docs/source/anonymisation/_crate_anonymise_multiprocess_help.txt index 032c7fe8..7aa275da 100644 --- a/docs/source/anonymisation/_crate_anonymise_multiprocess_help.txt +++ b/docs/source/anonymisation/_crate_anonymise_multiprocess_help.txt @@ -1,6 +1,6 @@ USAGE: crate_anonymise_multiprocess [-h] [--nproc [NPROC]] [--verbose] -Runs the CRATE anonymiser in parallel. Version 0.20.5 (2024-06-26). Note that +Runs the CRATE anonymiser in parallel. Version 0.20.6 (2024-06-26). Note that all arguments not specified here are passed to the underlying script (see crate_anonymise --help). diff --git a/docs/source/anonymisation/_crate_researcher_report_help.txt b/docs/source/anonymisation/_crate_researcher_report_help.txt index b7af53f9..bac25a69 100644 --- a/docs/source/anonymisation/_crate_researcher_report_help.txt +++ b/docs/source/anonymisation/_crate_researcher_report_help.txt @@ -16,7 +16,7 @@ USAGE: crate_researcher_report [-h] [--config CONFIG] [--noconfig] output Produce a researcher-oriented PDF report about a destination database. -(CRATE version 0.20.5, 2024-06-26. Created by Rudolf Cardinal.) +(CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.) 
 Note: if wkhtmtopdf reports 'Too many open files', see
 - https://stackoverflow.com/q/25355697;
diff --git a/docs/source/anonymisation/_crate_subset_db_help.txt b/docs/source/anonymisation/_crate_subset_db_help.txt
index 72c9175b..51dbd3ad 100644
--- a/docs/source/anonymisation/_crate_subset_db_help.txt
+++ b/docs/source/anonymisation/_crate_subset_db_help.txt
@@ -12,7 +12,7 @@ USAGE: crate_subset_db [-h] --src_db_url SRC_DB_URL --dst_db_url DST_DB_URL
                        [--exclude_table_filenames [EXCLUDE_TABLE_FILENAMES ...]]
                        [--verbose] [--echo]
 
-Create a simple subset of a database, copying one database to another while applying filters. You can filter by a standard column (e.g. one representing patient IDs), taking permitted filter values from the command line, from file(s), and/or from database(s). You can also decide which tables to include/exclude. (CRATE version 0.20.5, 2024-06-26. Created by Rudolf Cardinal.)
+Create a simple subset of a database, copying one database to another while applying filters. You can filter by a standard column (e.g. one representing patient IDs), taking permitted filter values from the command line, from file(s), and/or from database(s). You can also decide which tables to include/exclude. (CRATE version 0.20.6, 2024-06-26. Created by Rudolf Cardinal.)
 
 OPTIONS:
   -h, --help            show this help message and exit
diff --git a/docs/source/anonymisation/_specimen_anonymiser_config.ini b/docs/source/anonymisation/_specimen_anonymiser_config.ini
index 352110af..80f5da6d 100644
--- a/docs/source/anonymisation/_specimen_anonymiser_config.ini
+++ b/docs/source/anonymisation/_specimen_anonymiser_config.ini
@@ -1,5 +1,5 @@
 # Configuration file for CRATE anonymiser (crate_anonymise).
-# Version 0.20.5 (2024-06-26).
+# Version 0.20.6 (2024-06-26).
 #
 # SEE HELP FOR DETAILS.
diff --git a/docs/source/autodoc/nlp_manager/_index.rst b/docs/source/autodoc/nlp_manager/_index.rst
index b9b0bc87..519a5e45 100644
--- a/docs/source/autodoc/nlp_manager/_index.rst
+++ b/docs/source/autodoc/nlp_manager/_index.rst
@@ -57,6 +57,7 @@ crate_anon/nlp_manager
     parse_medex.py.rst
     parse_substance_misuse.py.rst
     prepare_umls_for_bioyodie.py.rst
+    processor_helpers.py.rst
     regex_func.py.rst
     regex_numbers.py.rst
     regex_parser.py.rst
diff --git a/docs/source/autodoc/nlp_manager/processor_helpers.py.rst b/docs/source/autodoc/nlp_manager/processor_helpers.py.rst
new file mode 100644
index 00000000..7439256f
--- /dev/null
+++ b/docs/source/autodoc/nlp_manager/processor_helpers.py.rst
@@ -0,0 +1,27 @@
+.. docs/source/autodoc/nlp_manager/processor_helpers.py.rst
+
+.. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
+
+.. Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
+    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
+    .
+    This file is part of CRATE.
+    .
+    CRATE is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    .
+    CRATE is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+    .
+    You should have received a copy of the GNU General Public License
+    along with CRATE. If not, see <https://www.gnu.org/licenses/>.
+
+crate_anon.nlp_manager.processor_helpers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+..
automodule:: crate_anon.nlp_manager.processor_helpers + :members: diff --git a/docs/source/autodoc_extra/_command_line_index.rst b/docs/source/autodoc_extra/_command_line_index.rst index 80e74e2d..3a5d15da 100644 --- a/docs/source/autodoc_extra/_command_line_index.rst +++ b/docs/source/autodoc_extra/_command_line_index.rst @@ -127,4 +127,4 @@ Index of CRATE commands :ref:`crate_windows_service ` -(Documentation built with CRATE 0.20.5.) +(Documentation built with CRATE 0.20.6.) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 2b61b518..6b695569 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1692,6 +1692,11 @@ Changes characters in the source database. Convert any unusual characters to ASCII in the destination database. +- Update NLP handler to cope with remote NLPRP servers providing tabular_schema + data, and create local tables based on this, if desired. Change default + NLPRP server behaviour to use the more explicit format (object/dictionaries + by table, not plain arrays/lists of records). + To do ----- diff --git a/docs/source/misc/technical_notes.rst b/docs/source/misc/technical_notes.rst index 2b67407b..09698435 100644 --- a/docs/source/misc/technical_notes.rst +++ b/docs/source/misc/technical_notes.rst @@ -759,7 +759,7 @@ webnotes.txt - ... or could validate manually with a form, e.g. form = MyForm(request.GET, extraparam) using the style at - https://stackoverflow.com/questions/18769607/django-form-with-customer-parameter-and-validation-not-getting-clean-function # noqa + https://stackoverflow.com/questions/18769607/django-form-with-customer-parameter-and-validation-not-getting-clean-function # noqa: E501 2. Query parameters - can encode using urllib, e.g. diff --git a/docs/source/nlp/_crate_nlp_help.txt b/docs/source/nlp/_crate_nlp_help.txt index 4a95e899..786c52b5 100644 --- a/docs/source/nlp/_crate_nlp_help.txt +++ b/docs/source/nlp/_crate_nlp_help.txt @@ -9,7 +9,7 @@ USAGE: crate_nlp [-h] [--config CONFIG] [--nlpdef NLPDEF] [-i | -f] [--count] [--cloud] [--immediate] [--retrieve] [--cancelrequest] [--cancelall] [--showqueue] -NLP manager. Version 0.20.5 (2024-06-26). Created by Rudolf Cardinal. +NLP manager. Version 0.20.6 (2024-06-26). Created by Rudolf Cardinal. OPTIONS: -h, --help show this help message and exit diff --git a/docs/source/nlp/_crate_nlp_multiprocess_help.txt b/docs/source/nlp/_crate_nlp_multiprocess_help.txt index e4eaaa23..de6d4982 100644 --- a/docs/source/nlp/_crate_nlp_multiprocess_help.txt +++ b/docs/source/nlp/_crate_nlp_multiprocess_help.txt @@ -1,7 +1,7 @@ USAGE: crate_nlp_multiprocess [-h] --nlpdef NLPDEF [--nproc [NPROC]] [--verbose] -Runs the CRATE NLP manager in parallel. Version 0.20.5 (2024-06-26). Note that +Runs the CRATE NLP manager in parallel. Version 0.20.6 (2024-06-26). Note that all arguments not specified here are passed to the underlying script (see crate_nlp --help). diff --git a/docs/source/nlp/_nlprp_test_client.py b/docs/source/nlp/_nlprp_test_client.py index 3b6aeaa1..bc549569 100755 --- a/docs/source/nlp/_nlprp_test_client.py +++ b/docs/source/nlp/_nlprp_test_client.py @@ -110,7 +110,7 @@ def fail(msg: str) -> NoReturn: # "deflate" transfer-encodings from the server; see # http://docs.python-requests.org/en/master/user/quickstart/. 
     # - The difference between gzip and deflate:
-    #   https://stackoverflow.com/questions/388595/why-use-deflate-instead-of-gzip-for-text-files-served-by-apache  # noqa
+    #   https://stackoverflow.com/questions/388595/why-use-deflate-instead-of-gzip-for-text-files-served-by-apache  # noqa: E501
     log.debug(
         f"Reply has status_code={r.status_code}, headers={r.headers!r}, "
         f"text={r.text!r}"
@@ -120,7 +120,7 @@ def fail(msg: str) -> NoReturn:
         # "r.text" automatically does gzip decode
     except (
         ValueError
-    ):  # includes simplejson.errors.JSONDecodeError, json.decoder.JSONDecodeError  # noqa
+    ):  # includes simplejson.errors.JSONDecodeError, json.decoder.JSONDecodeError  # noqa: E501
         fail("Reply was not JSON")
         response = None  # for type checker
     log.debug(f"Response JSON decoded to: {response.dict!r}")
diff --git a/docs/source/nlp/_specimen_nlp_config_file.ini b/docs/source/nlp/_specimen_nlp_config_file.ini
index fd1888cf..446634c6 100644
--- a/docs/source/nlp/_specimen_nlp_config_file.ini
+++ b/docs/source/nlp/_specimen_nlp_config_file.ini
@@ -1,5 +1,5 @@
 # Configuration file for CRATE NLP manager (crate_nlp).
-# Version 0.20.5 (2024-06-26).
+# Version 0.20.6 (2024-06-26).
 #
 # PLEASE SEE THE HELP at https://crateanon.readthedocs.io/
 # Using defaults for Docker environment: False
diff --git a/docs/source/nlp/nlp_config.rst b/docs/source/nlp/nlp_config.rst
index 512b7ea2..99f945a3 100644
--- a/docs/source/nlp/nlp_config.rst
+++ b/docs/source/nlp/nlp_config.rst
@@ -218,7 +218,9 @@
 and CRATE would then look for a :ref:`processor definition
 ``[processor:mygateproc_name_location]``, and expect it to have the
 information required for a GATE processor.
 
-For possible processor types, see ``crate_nlp --listprocessors``.
+For possible processor types, see ``crate_nlp --listprocessors``. These include
+CRATE internal processors (e.g. "Glucose"), external tools run locally (e.g.
+"GATE"), and cloud-based NLP ("Cloud").
 
 
 progressdb
@@ -456,33 +458,6 @@
 Destination database; the name of a :ref:`database definition
 ` in the config file.
 
 
-desttable
-#########
-
-*String.*
-
-**Applicable to: Cloud, MedEx, all CRATE Python processors.**
-
-The name of the table in the destination database in which the results should
-be stored.
-
-
-assume_preferred_unit
-#####################
-
-*Boolean.* Default: True.
-
-**Applicable to: nearly all numerical CRATE Python processors.**
-
-If a unit is not specified, assume that values are in the processor's preferred
-units. (For example, :class:`crate_anon.nlp_manager.parse_biochemistry.Crp`
-will assume mg/L.)
-
-Some override this and are not configurable, however:
-
-- ``AlcoholUnits`` never assumes this.
-
 
 .. _nlp_config_processor_desttable:
 
 desttable
 #########
 
 *String.*
 
-**Applicable to: Cloud.**
+**Applicable to: MedEx, all CRATE Python processors.**
 
-Table name in the destination (NLP output) database into which to write results
-from the cloud NLP processor. Use this for single-table processors.
+The name of the table in the destination database in which the results should
+be stored.
 
-The alternative is :ref:`outputtypemap <nlp_config_processor_outputtypemap>`.
+**Cloud** and local **GATE** processors may produce output for multiple tables
+(or a single table, but potentially one that you need to help define). For
+these, use :ref:`outputtypemap <nlp_config_processor_outputtypemap>` instead.
+This refers to "output" configurations, in which you can define the table(s).
 
 
 .. _nlp_config_processor_outputtypemap:
 
 outputtypemap
 #############
 
 **Applicable to: GATE, Cloud.**
 
-For GATE:
+For GATE processors:
 
     What's GATE?
    See the section on :ref:`GATE NLP `.

-   Map GATE '_type' parameters to possible destination tables (in
-   case-insensitive fashion). This parameter is follows is a list of pairs,
-   one pair per line.
+   This tabular entry maps GATE '_type' parameters to possible destination
+   tables (in case-insensitive fashion). This parameter is a list of pairs,
+   one pair per line.

    - The first item of each is the annotation type coming out of the GATE system.

    - The second is the output type section defined in this file (as a separate
-     section). Those sections (q.v.) define tables and columns (fields).
+     section). Those sections each define a table with its columns (fields);
+     see :ref:`GATE/cloud output definitions
+     `.

    Example:

@@ -531,15 +511,40 @@ For GATE:

    This example would take output from GATE labelled with ``_type=Person``
    and send it to output defined in the ``[output:output_person]`` section of the
-   config file -- see :ref:`GATE output definitions
-   `. Equivalently for the ``Location`` type.
+   config file -- see :ref:`GATE/cloud output definitions
+   `. Equivalently for the ``Location``
+   type.
+
+For cloud processors:
+
+  - The first parameter is the remote server's tablename (see :ref:`NLPRP
+    schema definition `). The second is an output
+    type definition, as above (which may define the table in full, or just
+    name it and leave the definition to the remote processor).
+
+  - Use this method whenever the remote processor may return data for more
+    than one table.
+
+  - If the remote processor will only return results for a single table, and
+    doesn't name it, the FIRST definition in the output type map is used
+    (and the first element of the pair is ignored for this purpose, i.e.
+    you can use any string you want).
+
+
+assume_preferred_unit
+#####################
+
+*Boolean.* Default: True.

-For cloud:
+**Applicable to: nearly all numerical CRATE Python processors.**

-  - The alternative is :ref:`desttable `.
+If a unit is not specified, assume that values are in the processor's preferred
+units. (For example, :class:`crate_anon.nlp_manager.parse_biochemistry.Crp`
+will assume mg/L.)

-  - If both are present, only :ref:`outputtypemap
-    ` will be used.
+Some override this and are not configurable, however:
+
+- ``AlcoholUnits`` never assumes this.


.. _nlp_config_section_gate_progargs:

@@ -739,17 +744,25 @@ same format as the CRATE processors.

``GATE`` refers to GATE remote processors, which return a standard set of
columns.

-.. _nlp_config_section_gate_output:

-Config file section: GATE output definition
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. _nlp_config_section_gate_cloud_output:

+Config file section: GATE/cloud output definition
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

These are config file sections named ``[output:XXX]`` where ``XXX`` is the
-name of one of your GATE output types [#outputuserconfig]_.
+name of one of your GATE output types, or cloud remote processor table
+names [#outputuserconfig]_.
+
+For GATE applications, we need this additional information because CRATE
+doesn't automatically know what sort of output they will produce. The tables
+and SPECIFIC output fields for a given GATE processor are defined here.

-This is an additional thing we need for GATE applications, since CRATE doesn't
-automatically know what sort of output they will produce. The tables and
-SPECIFIC output fields for a given GATE processor are defined here.
+For remote cloud processors, this section enables you to rename remote tables
+to something appropriate locally, and add options (like indexing).
+Additionally, the remote application may or may not tell CRATE exactly what
+tabular structure it uses. Even if the remote application is helpfully
+informative, you wouldn't automatically trust remotely provided table names.
+So this section is still mandatory.

They are referred to by the :ref:`outputtypemap ` parameter (q.v.).

@@ -761,7 +774,7 @@ desttable

*String.*

Table name in the destination (NLP output) database into which to write results
-from the GATE NLP application.
+from the GATE/cloud NLP application.


renames

@@ -769,9 +782,15 @@

*Multiline string.*

-A list of ``from, to`` things to rename from the GATE output en route to the
-database. In each case, the ``from`` item is the name of a GATE output
-annotation. The ``to`` item is the destination field/column name.
+This is an optional "column renaming" section.
+
+For GATE processors: a list of ``from, to`` things to rename from the GATE
+output en route to the database. In each case, the ``from`` item is the name of
+a GATE output annotation. The ``to`` item is the destination field/column name.
+
+Also applicable to cloud processors; you can rename columns in this way. The
+``from`` item is the column name as specified by the remote processor, and the
+``to`` item is the local destination column name.

Specify one pair per line. You can quote, using shlex_ rules. Case-sensitive.

@@ -807,6 +826,8 @@ null_literals

*Multiline string.*

+**Applicable to: GATE only.**
+
Define values that will be treated as ``NULL`` in SQL. For example, sometimes
GATE provides the string ``null`` for a NULL value; we can convert to a proper
SQL NULL.

@@ -853,6 +874,11 @@ separate the columns. Examples:

    gender VARCHAR(7) Gender (e.g. male, female, unknown)
    kind VARCHAR(100) Kind of name (e.g. personName, fullName)

+For cloud applications, this is optional. If you specify **any** lines here,
+your table will be created in this way (plus additional universal CRATE NLP
+columns). If you don't specify any, it will be created according to the remote
+table specification (plus additional universal CRATE NLP columns).
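+
+For example, here is a sketch of a complete output section for a hypothetical
+cloud processor (the remote table name ``person`` and all column names below
+are invented, purely for illustration). The processor section would first map
+the remote table to an output section::
+
+    outputtypemap =
+        person output_person
+
+and the output section itself might then read::
+
+    [output:output_person]
+
+    desttable = cloud_person
+    renames =
+        personName   person_name
+    destfields =
+        person_name   VARCHAR(255)   Name of the person found
+        gender        VARCHAR(7)     Gender, if stated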
+ indexdefs ######### diff --git a/docs/source/website_config/_specimen_web_config.py b/docs/source/website_config/_specimen_web_config.py index 23b30d90..3fd48f17 100644 --- a/docs/source/website_config/_specimen_web_config.py +++ b/docs/source/website_config/_specimen_web_config.py @@ -35,21 +35,21 @@ # ============================================================================= # Site URL configuration # ============================================================================= -# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa +# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501 -# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" # example for Apache # noqa -# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://localhost:8000" # for the Django dev server # noqa +# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" # example for Apache # noqa: E501 +# DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://localhost:8000" # for the Django dev server # noqa: E501 DJANGO_SITE_ROOT_ABSOLUTE_URL = "http://mymachine.mydomain" FORCE_SCRIPT_NAME = "" # FORCE_SCRIPT_NAME = "" # example for Apache root hosting -# FORCE_SCRIPT_NAME = "/crate" # example for CherryPy or Apache non-root hosting # noqa +# FORCE_SCRIPT_NAME = "/crate" # example for CherryPy or Apache non-root hosting # noqa: E501 # ============================================================================= # Site security # ============================================================================= -# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa +# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501 # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = "aaaaaaaaaaaaaaaaaa CHANGE THIS! aaaaaaaaaaaaaaaaaa" @@ -358,12 +358,12 @@ def always_show_toolbar(request: "HttpRequest") -> bool: EMAIL_USE_SSL = False # Who will the e-mails appear to come from? -EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa +EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa: E501 # ----------------------------------------------------------------------------- # Additional settings # ----------------------------------------------------------------------------- -# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa +# See https://crateanon.readthedocs.io/en/latest/website_config/web_config_file.html # noqa: E501 # During development, we route all consent-related e-mails to the developer. # Switch SAFETY_CATCH_ON to False for production mode. diff --git a/docs/source/website_config/web_config_file.rst b/docs/source/website_config/web_config_file.rst index a3950b8a..ed4b57b4 100644 --- a/docs/source/website_config/web_config_file.rst +++ b/docs/source/website_config/web_config_file.rst @@ -704,7 +704,7 @@ https://docs.djangoproject.com/en/1.8/ref/settings/#email-backend. Example: EMAIL_USE_SSL = False # Who will the e-mails appear to come from? 
- EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa + EMAIL_SENDER = "My NHS Trust Research Database - DO NOT REPLY " # noqa: E501 Then there are some additional custom settings: diff --git a/tools/make_package.py b/tools/make_package.py index 5d4982a7..23821424 100755 --- a/tools/make_package.py +++ b/tools/make_package.py @@ -347,8 +347,8 @@ def workpath(destpath: str, *args: str) -> str: f"adduser --system --ingroup {CRATE_GROUP} {CRATE_USER}" ) # MAKE_USER_COMMAND_2 = f"adduser --ingroup {CRATE_GROUP} {CRATE_USER}" - # https://lintian.debian.org/tags/maintainer-script-should-not-use-adduser-system-without-home.html # noqa - # http://unix.stackexchange.com/questions/47880/how-debian-package-should-create-user-accounts # noqa + # https://lintian.debian.org/tags/maintainer-script-should-not-use-adduser-system-without-home.html # noqa: E501 + # http://unix.stackexchange.com/questions/47880/how-debian-package-should-create-user-accounts # noqa: E501 else: MAKE_USER_COMMAND_1 = "# No need to add user" MAKE_USER_COMMAND_2 = "" @@ -514,7 +514,7 @@ def workpath(destpath: str, *args: str) -> str: f""" # Not an official new Debian package, so ignore this one. # If we did want to close a new-package ITP bug: -# http://www.debian.org/doc/manuals/developers-reference/pkgs.html#upload-bugfix # noqa +# http://www.debian.org/doc/manuals/developers-reference/pkgs.html#upload-bugfix # noqa: E501 {PACKAGE_FOR_DEB} binary: new-package-should-close-itp-bug {PACKAGE_FOR_DEB} binary: extra-license-file {PACKAGE_FOR_DEB} binary: embedded-javascript-library