-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This forms the basis of bulk data extraction for analysis; two new Template Driven Extraction templates will populate the `documents.summary` and `documents.propertysummary` tables, which can be collectively queried using an SQL `JOIN` to extract a table of key information about documents. These are a first pass, and will likely change as we get a better handle on the stats we need to extract.

Fields which have `<invalid-values>` set to `ignore` and `<nullable>` set to `true` will create rows with `NULL` values where a value cannot be found. Rows where this isn't the case will be dropped, as they will upset the `JOIN`.

`documents.summary`:

| Field | Value | Ignore invalid? |
| --- | --- | --- |
| `uri` | `xdmp:node-uri(.)` | No |
| `ncn` | `sem:iri(//uk:cite//text())` | Yes |
| `name` | `sem:iri(//akn:FRBRWork/akn:FRBRname/@value)` | Yes |

`documents.propertysummary`:

| Field | Value | Ignore invalid? |
| --- | --- | --- |
| `uri` | `xdmp:node-uri(.)` | No |
| `doc_uri` | `dls:version/dls:document-uri` | Yes |
| `version_number` | `dls:version/dls:version-id` | Yes |
| `modified` | `prop:last-modified` | Yes |
| `published` | `published` | Yes |
- Loading branch information
1 parent
95e45e7
commit 9160b75
Showing
2 changed files
with
110 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
<template xmlns="http://marklogic.com/xdmp/tde">
  <!-- Emits one row in the documents.summary view for every akn:FRBRWork
       element matched by the context path below. -->
  <context>//akn:FRBRWork</context>
  <!-- Template scope: the press-summary and judgment collections. -->
  <collections>
    <collection>press-summary</collection>
    <collection>judgment</collection>
  </collections>
  <!-- Prefix bindings for the path expressions in this template. Only the
       prefixes that the paths actually use (akn, uk) are declared. -->
  <path-namespaces>
    <path-namespace>
      <prefix>akn</prefix>
      <namespace-uri>http://docs.oasis-open.org/legaldocml/ns/akn/3.0</namespace-uri>
    </path-namespace>
    <path-namespace>
      <prefix>uk</prefix>
      <namespace-uri>https://caselaw.nationalarchives.gov.uk/akn</namespace-uri>
    </path-namespace>
  </path-namespaces>
  <rows>
    <row>
      <schema-name>documents</schema-name>
      <view-name>summary</view-name>
      <columns>
        <!-- URI of the node's document. Unlike the two columns below, this one
             sets neither <nullable> nor <invalid-values>. -->
        <column>
          <name>uri</name>
          <scalar-type>string</scalar-type>
          <val>xdmp:node-uri(.)</val>
          <reindexing>visible</reindexing>
        </column>
        <!-- Text found beneath uk:cite. Nullable, and invalid values are ignored.
             NOTE(review): the path starts with // so it presumably searches the
             whole document rather than only the FRBRWork context node; confirm.
             NOTE(review): the value is wrapped in sem:iri() although the column
             is declared as a string; confirm this is intended. -->
        <column>
          <name>ncn</name>
          <scalar-type>string</scalar-type>
          <val>sem:iri(//uk:cite//text())</val>
          <nullable>true</nullable>
          <invalid-values>ignore</invalid-values>
          <reindexing>visible</reindexing>
        </column>
        <!-- The value attribute of akn:FRBRname under akn:FRBRWork. Nullable, and
             invalid values are ignored.
             NOTE(review): this path also starts with // even though the context
             node is already an akn:FRBRWork; confirm a relative path was not
             intended. -->
        <column>
          <name>name</name>
          <scalar-type>string</scalar-type>
          <val>sem:iri(//akn:FRBRWork/akn:FRBRname/@value)</val>
          <nullable>true</nullable>
          <invalid-values>ignore</invalid-values>
          <reindexing>visible</reindexing>
        </column>
      </columns>
    </row>
  </rows>
</template>
59 changes: 59 additions & 0 deletions
59
src/main/ml-schemas/tde/sql-document-properties-extract.xsd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
<template xmlns="http://marklogic.com/xdmp/tde">
  <!-- Emits one row in the documents.propertysummary view for every node
       matched by the prop:properties context path. -->

  <!-- Prefix bindings for the path expressions in this template. -->
  <path-namespaces>
    <path-namespace>
      <prefix>prop</prefix>
      <namespace-uri>http://marklogic.com/xdmp/property</namespace-uri>
    </path-namespace>
    <path-namespace>
      <prefix>dls</prefix>
      <namespace-uri>http://marklogic.com/xdmp/dls</namespace-uri>
    </path-namespace>
  </path-namespaces>

  <context>prop:properties</context>

  <rows>
    <row>
      <schema-name>documents</schema-name>
      <view-name>propertysummary</view-name>
      <columns>

        <!-- URI of the context node. This is the only column that sets neither
             <nullable> nor <invalid-values>. -->
        <column>
          <name>uri</name>
          <scalar-type>string</scalar-type>
          <val>xdmp:node-uri(.)</val>
          <reindexing>visible</reindexing>
        </column>

        <!-- Every column below is nullable and ignores invalid values. -->

        <!-- dls:document-uri child of dls:version, stored as a string. -->
        <column>
          <name>doc_uri</name>
          <scalar-type>string</scalar-type>
          <val>dls:version/dls:document-uri</val>
          <nullable>true</nullable>
          <invalid-values>ignore</invalid-values>
          <reindexing>visible</reindexing>
        </column>

        <!-- dls:version-id child of dls:version, stored as an integer. -->
        <column>
          <name>version_number</name>
          <scalar-type>integer</scalar-type>
          <val>dls:version/dls:version-id</val>
          <nullable>true</nullable>
          <invalid-values>ignore</invalid-values>
          <reindexing>visible</reindexing>
        </column>

        <!-- prop:last-modified, stored as a dateTime. -->
        <column>
          <name>modified</name>
          <scalar-type>dateTime</scalar-type>
          <val>prop:last-modified</val>
          <nullable>true</nullable>
          <invalid-values>ignore</invalid-values>
          <reindexing>visible</reindexing>
        </column>

        <!-- Unprefixed "published" child element, stored as a string.
             NOTE(review): presumably a custom document property; confirm. -->
        <column>
          <name>published</name>
          <scalar-type>string</scalar-type>
          <val>published</val>
          <nullable>true</nullable>
          <invalid-values>ignore</invalid-values>
          <reindexing>visible</reindexing>
        </column>

      </columns>
    </row>
  </rows>
</template>