From 83406dce626b5897121afdab6c1d7558f4ffafd3 Mon Sep 17 00:00:00 2001 From: David McKee Date: Wed, 30 Oct 2024 11:45:13 +0000 Subject: [PATCH 1/2] build_xquery_type_dicts allows assignment of default variables --- script/build_xquery_type_dicts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/build_xquery_type_dicts b/script/build_xquery_type_dicts index aa3ed244..98edd49e 100755 --- a/script/build_xquery_type_dicts +++ b/script/build_xquery_type_dicts @@ -34,7 +34,7 @@ ML_TYPES_TO_PYTHON_TYPES_DICT = { } XQY_VARIABLE_DECLARATION_REGEX = re.compile( - r"\s*declare variable \$(.+) as (.+) external;".replace(" ", "\\s+"), + r"\s*declare variable \$(.+) as (.+) external".replace(" ", "\\s+"), re.IGNORECASE | re.MULTILINE, ) From 834a938ad8a16388ec626eb47e192a8862cffbc9 Mon Sep 17 00:00:00 2001 From: David McKee Date: Wed, 30 Oct 2024 11:45:54 +0000 Subject: [PATCH 2/2] Set default limit for reparse/enrich results to 1000, customisable --- CHANGELOG.md | 1 + src/caselawclient/Client.py | 4 ++++ .../xquery/get_pending_enrichment_for_version.xqy | 9 +++++++-- .../xquery/get_pending_parse_for_version.xqy | 11 +++++++---- src/caselawclient/xquery_type_dicts.py | 2 ++ 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b06c430a..3da61138 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog 1.0.0]. 
### Feat - **FCL-386**: search query can now be passed to get_document_by_uri +- **FCL-318**: Allow setting a limit on the number of enrich/reparse targets returned (`maximum_records`, default 1000) ## v27.2.0 (2024-10-28) diff --git a/src/caselawclient/Client.py b/src/caselawclient/Client.py index ce881be1..85bbe3c2 100644 --- a/src/caselawclient/Client.py +++ b/src/caselawclient/Client.py @@ -1080,6 +1080,7 @@ def get_pending_enrichment_for_version( self, target_enrichment_version: tuple[int, int], target_parser_version: tuple[int, int], + maximum_records: int = 1000, ) -> list[list[Any]]: """Retrieve documents which are not yet enriched with a given version.""" vars: query_dicts.GetPendingEnrichmentForVersionDict = { @@ -1087,6 +1088,7 @@ def get_pending_enrichment_for_version( "target_enrichment_minor_version": target_enrichment_version[1], "target_parser_major_version": target_parser_version[0], "target_parser_minor_version": target_parser_version[1], + "maximum_records": maximum_records, } results: list[list[Any]] = json.loads( get_single_string_from_marklogic_response( @@ -1115,11 +1117,13 @@ def get_highest_parser_version(self) -> tuple[int, int]: def get_pending_parse_for_version( self, target_version: tuple[int, int], + maximum_records: int = 1000, ) -> list[list[Any]]: """Retrieve documents which are not yet parsed with a given version.""" vars: query_dicts.GetPendingParseForVersionDict = { "target_major_version": target_version[0], "target_minor_version": target_version[1], + "maximum_records": maximum_records, } results: list[list[Any]] = json.loads( get_single_string_from_marklogic_response( diff --git a/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy b/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy index d9ff45cc..dbc666a7 100644 --- a/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy +++ b/src/caselawclient/xquery/get_pending_enrichment_for_version.xqy @@ -1,9 +1,11 @@ xquery version "1.0-ml"; +declare namespace
xdmp="http://marklogic.com/xdmp"; declare variable $target_enrichment_major_version as xs:int external; declare variable $target_enrichment_minor_version as xs:int external; declare variable $target_parser_major_version as xs:int external; declare variable $target_parser_minor_version as xs:int external; +declare variable $maximum_records as xs:int? external := 1000; xdmp:to-json(xdmp:sql( "SELECT process_data.uri, enrich_version_string, minutes_since_enrichment_request @@ -23,13 +25,16 @@ xdmp:to-json(xdmp:sql( (parser_major_version = @target_parser_major_version AND parser_minor_version = @target_parser_minor_version) ) AND (minutes_since_enrichment_request > 43200 OR minutes_since_enrichment_request IS NULL) - ORDER BY enrich_major_version ASC NULLS FIRST, enrich_minor_version ASC", + ORDER BY enrich_major_version ASC NULLS FIRST, enrich_minor_version ASC + LIMIT @maximum_records", "array", map:new(( map:entry("target_enrichment_major_version", $target_enrichment_major_version), map:entry("target_enrichment_minor_version", $target_enrichment_minor_version), map:entry("target_parser_major_version", $target_parser_major_version), - map:entry("target_parser_minor_version", $target_parser_minor_version) + map:entry("target_parser_minor_version", $target_parser_minor_version), + map:entry("maximum_records", $maximum_records) + )) )) diff --git a/src/caselawclient/xquery/get_pending_parse_for_version.xqy b/src/caselawclient/xquery/get_pending_parse_for_version.xqy index 35eaa61e..38962510 100644 --- a/src/caselawclient/xquery/get_pending_parse_for_version.xqy +++ b/src/caselawclient/xquery/get_pending_parse_for_version.xqy @@ -2,6 +2,7 @@ xquery version "1.0-ml"; declare variable $target_major_version as xs:int external; declare variable $target_minor_version as xs:int external; +declare variable $maximum_records as xs:int? 
external := 1000; xdmp:to-json(xdmp:sql( "SELECT process_data.uri, parser_version_string, minutes_since_parse_request @@ -18,11 +19,13 @@ xdmp:to-json(xdmp:sql( (parser_major_version <= @target_major_version AND parser_minor_version < @target_minor_version) ) AND (minutes_since_parse_request > 43200 OR minutes_since_parse_request IS NULL) - ORDER BY parser_major_version ASC NULLS FIRST, parser_minor_version ASC", + ORDER BY parser_major_version ASC NULLS FIRST, parser_minor_version ASC + LIMIT @maximum_records", "array", map:new(( map:entry("target_major_version", $target_major_version), - map:entry("target_minor_version", $target_minor_version) + map:entry("target_minor_version", $target_minor_version), + map:entry("maximum_records", $maximum_records) + )) -)) - +)) \ No newline at end of file diff --git a/src/caselawclient/xquery_type_dicts.py b/src/caselawclient/xquery_type_dicts.py index 9d1128d4..a47be919 100644 --- a/src/caselawclient/xquery_type_dicts.py +++ b/src/caselawclient/xquery_type_dicts.py @@ -88,6 +88,7 @@ class GetLastModifiedDict(MarkLogicAPIDict): # get_pending_enrichment_for_version.xqy class GetPendingEnrichmentForVersionDict(MarkLogicAPIDict): + maximum_records: Optional[int] target_enrichment_major_version: int target_enrichment_minor_version: int target_parser_major_version: int @@ -96,6 +97,7 @@ class GetPendingEnrichmentForVersionDict(MarkLogicAPIDict): # get_pending_parse_for_version.xqy class GetPendingParseForVersionDict(MarkLogicAPIDict): + maximum_records: Optional[int] target_major_version: int target_minor_version: int