Skip to content

Commit

Permalink
Merge pull request #753 from nationalarchives/FCL-138-set-limits-for-reparse-and-bulk-enrich
Browse files Browse the repository at this point in the history

[FCL 138] Allow setting limits for reparse and bulk enrich
  • Loading branch information
dragon-dxw authored Oct 30, 2024
2 parents 094784c + 834a938 commit 45c99b5
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog 1.0.0].
### Feat

- **FCL-386**: search query can now be passed to get_document_by_uri
- **FCL-138**: Allow setting a limit on the number of enrich/reparse targets returned

## v27.2.0 (2024-10-28)

Expand Down
2 changes: 1 addition & 1 deletion script/build_xquery_type_dicts
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ ML_TYPES_TO_PYTHON_TYPES_DICT = {
}

XQY_VARIABLE_DECLARATION_REGEX = re.compile(
r"\s*declare variable \$(.+) as (.+) external;".replace(" ", "\\s+"),
r"\s*declare variable \$(.+) as (.+) external".replace(" ", "\\s+"),
re.IGNORECASE | re.MULTILINE,
)

Expand Down
4 changes: 4 additions & 0 deletions src/caselawclient/Client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1080,13 +1080,15 @@ def get_pending_enrichment_for_version(
self,
target_enrichment_version: tuple[int, int],
target_parser_version: tuple[int, int],
maximum_records: int = 1000,
) -> list[list[Any]]:
"""Retrieve documents which are not yet enriched with a given version."""
vars: query_dicts.GetPendingEnrichmentForVersionDict = {
"target_enrichment_major_version": target_enrichment_version[0],
"target_enrichment_minor_version": target_enrichment_version[1],
"target_parser_major_version": target_parser_version[0],
"target_parser_minor_version": target_parser_version[1],
"maximum_records": maximum_records,
}
results: list[list[Any]] = json.loads(
get_single_string_from_marklogic_response(
Expand Down Expand Up @@ -1115,11 +1117,13 @@ def get_highest_parser_version(self) -> tuple[int, int]:
def get_pending_parse_for_version(
self,
target_version: tuple[int, int],
maximum_records: int = 1000,
) -> list[list[Any]]:
"""Retrieve documents which are not yet parsed with a given version."""
vars: query_dicts.GetPendingParseForVersionDict = {
"target_major_version": target_version[0],
"target_minor_version": target_version[1],
"maximum_records": maximum_records,
}
results: list[list[Any]] = json.loads(
get_single_string_from_marklogic_response(
Expand Down
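9 changes: 7 additions & 2 deletions src/caselawclient/xquery/get_pending_enrichment_for_version.xqy
Original file line number Diff line number Diff line change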
@@ -1,9 +1,11 @@
xquery version "1.0-ml";

declare namespace xdmp="http://marklogic.com/xdmp";
declare variable $target_enrichment_major_version as xs:int external;
declare variable $target_enrichment_minor_version as xs:int external;
declare variable $target_parser_major_version as xs:int external;
declare variable $target_parser_minor_version as xs:int external;
declare variable $maximum_records as xs:int? external := 1000;

xdmp:to-json(xdmp:sql(
"SELECT process_data.uri, enrich_version_string, minutes_since_enrichment_request
Expand All @@ -23,13 +25,16 @@ xdmp:to-json(xdmp:sql(
(parser_major_version = @target_parser_major_version AND parser_minor_version = @target_parser_minor_version)
)
AND (minutes_since_enrichment_request > 43200 OR minutes_since_enrichment_request IS NULL)
ORDER BY enrich_major_version ASC NULLS FIRST, enrich_minor_version ASC",
ORDER BY enrich_major_version ASC NULLS FIRST, enrich_minor_version ASC
LIMIT @maximum_records",
"array",
map:new((
map:entry("target_enrichment_major_version", $target_enrichment_major_version),
map:entry("target_enrichment_minor_version", $target_enrichment_minor_version),
map:entry("target_parser_major_version", $target_parser_major_version),
map:entry("target_parser_minor_version", $target_parser_minor_version)
map:entry("target_parser_minor_version", $target_parser_minor_version),
map:entry("maximum_records", $maximum_records)

))
))

11 changes: 7 additions & 4 deletions src/caselawclient/xquery/get_pending_parse_for_version.xqy
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ xquery version "1.0-ml";

declare variable $target_major_version as xs:int external;
declare variable $target_minor_version as xs:int external;
declare variable $maximum_records as xs:int? external := 1000;

xdmp:to-json(xdmp:sql(
"SELECT process_data.uri, parser_version_string, minutes_since_parse_request
Expand All @@ -18,11 +19,13 @@ xdmp:to-json(xdmp:sql(
(parser_major_version <= @target_major_version AND parser_minor_version < @target_minor_version)
)
AND (minutes_since_parse_request > 43200 OR minutes_since_parse_request IS NULL)
ORDER BY parser_major_version ASC NULLS FIRST, parser_minor_version ASC",
ORDER BY parser_major_version ASC NULLS FIRST, parser_minor_version ASC
LIMIT @maximum_records",
"array",
map:new((
map:entry("target_major_version", $target_major_version),
map:entry("target_minor_version", $target_minor_version)
map:entry("target_minor_version", $target_minor_version),
map:entry("maximum_records", $maximum_records)

))
))

))
2 changes: 2 additions & 0 deletions src/caselawclient/xquery_type_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ class GetLastModifiedDict(MarkLogicAPIDict):

# get_pending_enrichment_for_version.xqy
class GetPendingEnrichmentForVersionDict(MarkLogicAPIDict):
maximum_records: Optional[int]
target_enrichment_major_version: int
target_enrichment_minor_version: int
target_parser_major_version: int
Expand All @@ -96,6 +97,7 @@ class GetPendingEnrichmentForVersionDict(MarkLogicAPIDict):

# get_pending_parse_for_version.xqy
class GetPendingParseForVersionDict(MarkLogicAPIDict):
maximum_records: Optional[int]
target_major_version: int
target_minor_version: int

Expand Down

0 comments on commit 45c99b5

Please sign in to comment.