From 183d0e58843d2b0c366d8428e1f67772d4f3f51f Mon Sep 17 00:00:00 2001
From: Nick Jackson
Date: Wed, 24 Jan 2024 13:19:10 +0000
Subject: [PATCH 1/2] Add API calls to get document parse information

A document is pending a parse when it has never been parsed, or when its
parser version is lower than the target version: either a lower major
version, or the same major version with a lower minor version.

---
 CHANGELOG.md                                  |  2 ++
 src/caselawclient/Client.py                   | 32 +++++++++++++++++++
 .../xquery/get_highest_parser_version.xqy     |  9 ++++++
 .../xquery/get_pending_parse_for_version.xqy  | 28 ++++++++++++++++
 src/caselawclient/xquery_type_dicts.py        |  6 ++++
 5 files changed, 77 insertions(+)
 create mode 100644 src/caselawclient/xquery/get_highest_parser_version.xqy
 create mode 100644 src/caselawclient/xquery/get_pending_parse_for_version.xqy

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 21c4618d..ddecd3e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@ The format is based on [Keep a Changelog 1.0.0].
 
 ## Unreleased
 
+- **Feature:** New `Client.get_pending_parse_for_version` and `Client.get_highest_parser_version` methods to help find documents in need of re-parsing.
+
 ## [Release 19.1.0]
 
 - Add support for quoted phrase prioritisation in result snippets
diff --git a/src/caselawclient/Client.py b/src/caselawclient/Client.py
index 109c5d73..a7a4e4c7 100644
--- a/src/caselawclient/Client.py
+++ b/src/caselawclient/Client.py
@@ -984,3 +984,35 @@ def get_pending_enrichment_for_version(
         )
 
         return results
+
+    def get_highest_parser_version(self) -> tuple[int, int]:
+        """This gets the highest parser version in the database, so if nothing has been parsed with the most recent version of the parser, this won't reflect that change."""
+        table = json.loads(
+            get_single_string_from_marklogic_response(
+                self._send_to_eval(
+                    {},
+                    "get_highest_parser_version.xqy",
+                )
+            )
+        )
+
+        return (int(table[1][1]), int(table[1][2]))
+
+    def get_pending_parse_for_version(
+        self, target_version: tuple[int, int]
+    ) -> list[list[Any]]:
+        """Retrieve documents which are not yet parsed with a given version."""
+        vars: query_dicts.GetPendingParseForVersionDict = {
+            "target_major_version": target_version[0],
+            "target_minor_version": target_version[1],
+        }
+        results: list[list[Any]] = json.loads(
+            get_single_string_from_marklogic_response(
+                self._send_to_eval(
+                    vars,
+                    "get_pending_parse_for_version.xqy",
+                )
+            )
+        )
+
+        return results
diff --git a/src/caselawclient/xquery/get_highest_parser_version.xqy b/src/caselawclient/xquery/get_highest_parser_version.xqy
new file mode 100644
index 00000000..dfe8c444
--- /dev/null
+++ b/src/caselawclient/xquery/get_highest_parser_version.xqy
@@ -0,0 +1,9 @@
+xquery version "1.0-ml";
+
+xdmp:to-json(xdmp:sql(
+  "SELECT parser_version_string, parser_major_version, parser_minor_version
+  FROM documents.process_data
+  ORDER BY parser_major_version DESC, parser_minor_version DESC
+  LIMIT 1",
+  "array"
+))
diff --git a/src/caselawclient/xquery/get_pending_parse_for_version.xqy b/src/caselawclient/xquery/get_pending_parse_for_version.xqy
new file mode 100644
index 00000000..35eaa61e
--- /dev/null
+++ b/src/caselawclient/xquery/get_pending_parse_for_version.xqy
@@ -0,0 +1,28 @@
+xquery version "1.0-ml";
+
+declare variable $target_major_version as xs:int external;
+declare variable $target_minor_version as xs:int external;
+
+xdmp:to-json(xdmp:sql(
+  "SELECT process_data.uri, parser_version_string, minutes_since_parse_request
+  FROM (
+    SELECT
+      process_data.uri,
+      parser_version_string, parser_major_version, parser_minor_version,
+      DATEDIFF('minute', last_sent_to_parser, CURRENT_TIMESTAMP) AS minutes_since_parse_request
+    FROM documents.process_data
+    JOIN documents.process_property_data ON process_data.uri = process_property_data.uri
+  )
+  WHERE (
+    (parser_version_string IS NULL) OR (parser_major_version < @target_major_version) OR
+    (parser_major_version = @target_major_version AND parser_minor_version < @target_minor_version)
+  )
+  AND (minutes_since_parse_request > 43200 OR minutes_since_parse_request IS NULL)
+  ORDER BY parser_major_version ASC NULLS FIRST, parser_minor_version ASC",
+  "array",
+  map:new((
+    map:entry("target_major_version", $target_major_version),
+    map:entry("target_minor_version", $target_minor_version)
+  ))
+))
+
diff --git a/src/caselawclient/xquery_type_dicts.py b/src/caselawclient/xquery_type_dicts.py
index c6a9372e..9d7e8f41 100644
--- a/src/caselawclient/xquery_type_dicts.py
+++ b/src/caselawclient/xquery_type_dicts.py
@@ -83,6 +83,12 @@ class GetPendingEnrichmentForVersionDict(MarkLogicAPIDict):
     target_version: int
 
 
+# get_pending_parse_for_version.xqy
+class GetPendingParseForVersionDict(MarkLogicAPIDict):
+    target_major_version: int
+    target_minor_version: int
+
+
 # get_properties_for_search_results.xqy
 class GetPropertiesForSearchResultsDict(MarkLogicAPIDict):
     uris: list[Any]

From 05cbeb241370190d09ff285ad9d00b4196b10851 Mon Sep 17 00:00:00 2001
From: Nick Jackson
Date: Wed, 24 Jan 2024 14:15:40 +0000
Subject: [PATCH 2/2] Re-enrichment now uses minor version, not just major version

A document is pending enrichment when it has never been enriched, or when
its enrichment version is lower than the target version: either a lower
major version, or the same major version with a lower minor version.

---
 CHANGELOG.md                                  |  1 +
 src/caselawclient/Client.py                   |  9 +++++----
 .../xquery/get_highest_enrichment_version.xqy |  4 ++--
 .../get_pending_enrichment_for_version.xqy    | 20 ++++++++++++++-----
 src/caselawclient/xquery_type_dicts.py        |  3 ++-
 5 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ddecd3e8..287b7349 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog 1.0.0].
 ## Unreleased
 
 - **Feature:** New `Client.get_pending_parse_for_version` and `Client.get_highest_parser_version` methods to help find documents in need of re-parsing.
+- **Breaking:** `Client.get_pending_enrichment_for_version` now accepts a tuple of `(major_version, minor_version)` rather than a single major version.
 
 ## [Release 19.1.0]
 
diff --git a/src/caselawclient/Client.py b/src/caselawclient/Client.py
index a7a4e4c7..82672c94 100644
--- a/src/caselawclient/Client.py
+++ b/src/caselawclient/Client.py
@@ -952,7 +952,7 @@ def get_combined_stats_table(self) -> list[list[Any]]:
 
         return results
 
-    def get_highest_enrichment_version(self) -> int:
+    def get_highest_enrichment_version(self) -> tuple[int, int]:
         """This gets the highest enrichment version in the database,
         so if nothing has been enriched with the most recent version of enrichment,
         this won't reflect that change."""
@@ -965,14 +965,15 @@ def get_highest_enrichment_version(self) -> int:
             )
         )
 
-        return int(table[1][1])
+        return (int(table[1][1]), int(table[1][2]))
 
     def get_pending_enrichment_for_version(
-        self, target_version: int
+        self, target_version: tuple[int, int]
     ) -> list[list[Any]]:
         """Retrieve documents which are not yet enriched with a given version."""
         vars: query_dicts.GetPendingEnrichmentForVersionDict = {
-            "target_version": target_version
+            "target_major_version": target_version[0],
+            "target_minor_version": target_version[1],
         }
         results: list[list[Any]] = json.loads(
             get_single_string_from_marklogic_response(
diff --git a/src/caselawclient/xquery/get_highest_enrichment_version.xqy b/src/caselawclient/xquery/get_highest_enrichment_version.xqy
index 45b5c1eb..30a50291 100644
--- a/src/caselawclient/xquery/get_highest_enrichment_version.xqy
+++ b/src/caselawclient/xquery/get_highest_enrichment_version.xqy
@@ -1,9 +1,9 @@
 xquery version "1.0-ml";
 
 xdmp:to-json(xdmp:sql(
-  "SELECT enrich_version_string, enrich_major_version
+  "SELECT enrich_version_string, enrich_major_version, enrich_minor_version
   FROM documents.process_data
-  ORDER BY enrich_major_version DESC
+  ORDER BY enrich_major_version DESC, enrich_minor_version DESC
   LIMIT 1",
   "array"
 ))
diff --git a/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy b/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy
index 0ddbb0d5..6a58eacc 100644
--- a/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy
+++ b/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy
@@ -1,18 +1,28 @@
 xquery version "1.0-ml";
 
-declare variable $target_version as xs:int external;
+declare variable $target_major_version as xs:int external;
+declare variable $target_minor_version as xs:int external;
 
 xdmp:to-json(xdmp:sql(
   "SELECT process_data.uri, enrich_version_string, minutes_since_enrichment_request
   FROM (
-    SELECT process_data.uri, enrich_version_string, enrich_major_version, DATEDIFF('minute', last_sent_to_enrichment, CURRENT_TIMESTAMP) AS minutes_since_enrichment_request
+    SELECT
+      process_data.uri,
+      enrich_version_string, enrich_major_version, enrich_minor_version,
+      DATEDIFF('minute', last_sent_to_enrichment, CURRENT_TIMESTAMP) AS minutes_since_enrichment_request
     FROM documents.process_data
     JOIN documents.process_property_data ON process_data.uri = process_property_data.uri
   )
-  WHERE ((enrich_version_string IS NULL) OR (enrich_major_version < @target_version))
+  WHERE (
+    (enrich_version_string IS NULL) OR (enrich_major_version < @target_major_version) OR
+    (enrich_major_version = @target_major_version AND enrich_minor_version < @target_minor_version)
+  )
   AND (minutes_since_enrichment_request > 43200 OR minutes_since_enrichment_request IS NULL)
-  ORDER BY enrich_major_version ASC NULLS FIRST",
+  ORDER BY enrich_major_version ASC NULLS FIRST, enrich_minor_version ASC",
   "array",
-  map:new(map:entry("target_version", $target_version))
+  map:new((
+    map:entry("target_major_version", $target_major_version),
+    map:entry("target_minor_version", $target_minor_version)
+  ))
 ))
 
diff --git a/src/caselawclient/xquery_type_dicts.py b/src/caselawclient/xquery_type_dicts.py
index 9d7e8f41..b4df2b15 100644
--- a/src/caselawclient/xquery_type_dicts.py
+++ b/src/caselawclient/xquery_type_dicts.py
@@ -80,7 +80,8 @@ class GetLastModifiedDict(MarkLogicAPIDict):
 
 # get_pending_enrichment_for_version.xqy
 class GetPendingEnrichmentForVersionDict(MarkLogicAPIDict):
-    target_version: int
+    target_major_version: int
+    target_minor_version: int
 
 
 # get_pending_parse_for_version.xqy