Skip to content

Commit

Permalink
Merge pull request #531 from nationalarchives/feature/awaiting-reparse-report
Browse files Browse the repository at this point in the history

Add API calls to get document parse information
  • Loading branch information
jacksonj04 authored Jan 24, 2024
2 parents 21fd463 + 05cbeb2 commit a37fab3
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 12 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog 1.0.0].

## Unreleased

- **Feature:** New `Client.get_pending_parse_for_version` and `Client.get_highest_parser_version` methods to help find documents in need of re-parsing.
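- **Breaking:** `Client.get_pending_enrichment_for_version` now accepts a tuple of `(major_version, minor_version)` rather than a single major version, and `Client.get_highest_enrichment_version` now returns a `(major_version, minor_version)` tuple rather than a single major version.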

## [Release 19.1.0]

- Add support for quoted phrase prioritisation in result snippets
Expand Down
41 changes: 37 additions & 4 deletions src/caselawclient/Client.py
Original file line number Diff line number Diff line change
Expand Up @@ -952,7 +952,7 @@ def get_combined_stats_table(self) -> list[list[Any]]:

return results

def get_highest_enrichment_version(self) -> int:
def get_highest_enrichment_version(self) -> tuple[int, int]:
"""This gets the highest enrichment version in the database,
so if nothing has been enriched with the most recent version of enrichment,
this won't reflect that change."""
Expand All @@ -965,14 +965,15 @@ def get_highest_enrichment_version(self) -> int:
)
)

return int(table[1][1])
return (int(table[1][1]), int(table[1][2]))

def get_pending_enrichment_for_version(
self, target_version: int
self, target_version: tuple[int, int]
) -> list[list[Any]]:
"""Retrieve documents which are not yet enriched with a given version."""
vars: query_dicts.GetPendingEnrichmentForVersionDict = {
"target_version": target_version
"target_major_version": target_version[0],
"target_minor_version": target_version[1],
}
results: list[list[Any]] = json.loads(
get_single_string_from_marklogic_response(
Expand All @@ -984,3 +985,35 @@ def get_pending_enrichment_for_version(
)

return results

def get_highest_parser_version(self) -> tuple[int, int]:
    """Return the highest parser version currently recorded in the database.

    Only versions already stored against a document are considered, so a
    newer parser release which has not yet parsed anything will not be
    reflected in the result.

    The result is a ``(major_version, minor_version)`` tuple.
    """
    response = self._send_to_eval({}, "get_highest_parser_version.xqy")
    rows = json.loads(get_single_string_from_marklogic_response(response))

    # The query selects (version string, major, minor) in that order, and the
    # values are read from the second row of the decoded table.
    # NOTE(review): row 0 is presumably the column-header row -- confirm.
    version_row = rows[1]
    major_version = int(version_row[1])
    minor_version = int(version_row[2])
    return (major_version, minor_version)

def get_pending_parse_for_version(
    self, target_version: tuple[int, int]
) -> list[list[Any]]:
    """Retrieve documents which are not yet parsed with a given version.

    ``target_version`` is a ``(major_version, minor_version)`` pair; its two
    components are passed to ``get_pending_parse_for_version.xqy`` as the
    ``target_major_version`` and ``target_minor_version`` variables. The
    decoded JSON table produced by that query is returned as-is.
    """
    query_vars: query_dicts.GetPendingParseForVersionDict = {
        "target_major_version": target_version[0],
        "target_minor_version": target_version[1],
    }
    response = self._send_to_eval(query_vars, "get_pending_parse_for_version.xqy")
    pending: list[list[Any]] = json.loads(
        get_single_string_from_marklogic_response(response)
    )
    return pending
4 changes: 2 additions & 2 deletions src/caselawclient/xquery/get_highest_enrichment_version.xqy
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
xquery version "1.0-ml";

xdmp:to-json(xdmp:sql(
"SELECT enrich_version_string, enrich_major_version
"SELECT enrich_version_string, enrich_major_version, enrich_minor_version
FROM documents.process_data
ORDER BY enrich_major_version DESC
ORDER BY enrich_major_version DESC, enrich_minor_version DESC
LIMIT 1",
"array"
))
9 changes: 9 additions & 0 deletions src/caselawclient/xquery/get_highest_parser_version.xqy
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
xquery version "1.0-ml";

(:
  Returns, serialised as JSON, the single row of documents.process_data with
  the highest parser version. Rows are ordered by major version and then by
  minor version, both descending, and only the first row is kept.

  Selected columns, in order: parser_version_string, parser_major_version,
  parser_minor_version.

  NOTE(review): with the "array" output option the result presumably starts
  with a row of column names, so callers would read the data from the second
  row. Confirm against the xdmp:sql documentation.
:)
xdmp:to-json(xdmp:sql(
"SELECT parser_version_string, parser_major_version, parser_minor_version
FROM documents.process_data
ORDER BY parser_major_version DESC, parser_minor_version DESC
LIMIT 1",
"array"
))
20 changes: 15 additions & 5 deletions src/caselawclient/xquery/get_pending_enrichment_for_version.xqy
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
xquery version "1.0-ml";

declare variable $target_version as xs:int external;
declare variable $target_major_version as xs:int external;
declare variable $target_minor_version as xs:int external;

xdmp:to-json(xdmp:sql(
"SELECT process_data.uri, enrich_version_string, minutes_since_enrichment_request
FROM (
SELECT process_data.uri, enrich_version_string, enrich_major_version, DATEDIFF('minute', last_sent_to_enrichment, CURRENT_TIMESTAMP) AS minutes_since_enrichment_request
SELECT
process_data.uri,
enrich_version_string, enrich_major_version, enrich_minor_version,
DATEDIFF('minute', last_sent_to_enrichment, CURRENT_TIMESTAMP) AS minutes_since_enrichment_request
FROM documents.process_data
JOIN documents.process_property_data ON process_data.uri = process_property_data.uri
)
WHERE ((enrich_version_string IS NULL) OR (enrich_major_version < @target_version))
WHERE (
(enrich_version_string IS NULL) OR
(enrich_major_version <= @target_major_version AND enrich_minor_version < @target_minor_version)
)
AND (minutes_since_enrichment_request > 43200 OR minutes_since_enrichment_request IS NULL)
ORDER BY enrich_major_version ASC NULLS FIRST",
ORDER BY enrich_major_version ASC NULLS FIRST, enrich_minor_version ASC",
"array",
map:new(map:entry("target_version", $target_version))
map:new((
map:entry("target_major_version", $target_major_version),
map:entry("target_minor_version", $target_minor_version)
))
))

28 changes: 28 additions & 0 deletions src/caselawclient/xquery/get_pending_parse_for_version.xqy
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
xquery version "1.0-ml";

(:
  Lists documents which have not yet been parsed with the target parser
  version, serialised as JSON.

  External variables:
    $target_major_version - major component of the target parser version
    $target_minor_version - minor component of the target parser version

  A document is reported when BOTH of the following hold:
    1. It has no parser version string, OR its (major, minor) parser version
       is strictly lower than the target. The pair is compared as a version:
       a lower major version always counts as older, and the minor version is
       only compared when the major versions are equal. Comparing the two
       components independently would miss, for example, a document at 1.5
       when the target is 2.0.
    2. It was last sent to the parser more than 43200 minutes (30 days) ago,
       or has no recorded parse request at all.

  Output columns: uri, parser_version_string, minutes_since_parse_request.
  Rows are ordered by ascending parser version, with NULL major versions
  first.
:)
declare variable $target_major_version as xs:int external;
declare variable $target_minor_version as xs:int external;

xdmp:to-json(xdmp:sql(
  "SELECT process_data.uri, parser_version_string, minutes_since_parse_request
  FROM (
    SELECT
      process_data.uri,
      parser_version_string, parser_major_version, parser_minor_version,
      DATEDIFF('minute', last_sent_to_parser, CURRENT_TIMESTAMP) AS minutes_since_parse_request
    FROM documents.process_data
    JOIN documents.process_property_data ON process_data.uri = process_property_data.uri
  )
  WHERE (
    (parser_version_string IS NULL) OR
    (parser_major_version < @target_major_version) OR
    (parser_major_version = @target_major_version AND parser_minor_version < @target_minor_version)
  )
  AND (minutes_since_parse_request > 43200 OR minutes_since_parse_request IS NULL)
  ORDER BY parser_major_version ASC NULLS FIRST, parser_minor_version ASC",
  "array",
  map:new((
    map:entry("target_major_version", $target_major_version),
    map:entry("target_minor_version", $target_minor_version)
  ))
))
))

9 changes: 8 additions & 1 deletion src/caselawclient/xquery_type_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,14 @@ class GetLastModifiedDict(MarkLogicAPIDict):

# get_pending_enrichment_for_version.xqy
class GetPendingEnrichmentForVersionDict(MarkLogicAPIDict):
target_version: int
target_major_version: int
target_minor_version: int


# get_pending_parse_for_version.xqy
class GetPendingParseForVersionDict(MarkLogicAPIDict):
    """Variables passed to ``get_pending_parse_for_version.xqy``."""

    # Major component of the target parser version.
    target_major_version: int
    # Minor component of the target parser version.
    target_minor_version: int


# get_properties_for_search_results.xqy
Expand Down

0 comments on commit a37fab3

Please sign in to comment.