From f9a92572f79710d6085f2522c5547afcb24b7334 Mon Sep 17 00:00:00 2001
From: David McKee
Date: Fri, 13 Dec 2024 15:15:36 +0000
Subject: [PATCH] Use corb2 to set identifiers for all primary documents that aren't failures

---
 .gitignore                   |  2 ++
 corb2/README.md              |  7 +++++++
 corb2/corb                   |  9 +++++++++
 corb2/get-uris.xqy           |  6 ++++++
 corb2/migrate-ncn.properties | 11 +++++++++++
 corb2/migrate-ncn.xqy        | 45 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 80 insertions(+)
 create mode 100644 corb2/README.md
 create mode 100755 corb2/corb
 create mode 100644 corb2/get-uris.xqy
 create mode 100644 corb2/migrate-ncn.properties
 create mode 100644 corb2/migrate-ncn.xqy

diff --git a/.gitignore b/.gitignore
index 2036290..30499b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ gradle-*.properties
 !gradle-development.properties
 __pycache__
 node_modules/
+corb2/*.jar
+corb2/*.log
diff --git a/corb2/README.md b/corb2/README.md
new file mode 100644
index 0000000..df6e42a
--- /dev/null
+++ b/corb2/README.md
@@ -0,0 +1,7 @@
+`brew install temurin` to get a working Java (I had to install-uninstall-install)
+
+Download corb2 from https://github.com/marklogic-community/corb2/releases
+Download xcc from https://repo1.maven.org/maven2/com/marklogic/marklogic-xcc/11.1.0/ or https://developer.marklogic.com/products/xcc-2/ (I used maven)
+(both into this directory)
+
+From this directory, `./corb migrate-ncn` will run the code, `migrate-ncn.log` will show what it writes to the identifiers
diff --git a/corb2/corb b/corb2/corb
new file mode 100755
index 0000000..387f6f2
--- /dev/null
+++ b/corb2/corb
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Run a CoRB2 job: `./corb <job>` takes its options from <job>.properties.
+# Run it from this directory: the classpath and options file are relative.
+# Set XCC_CONNECTION_URI to override the default local connection string.
+if [ -z "${1:-}" ]; then
+    echo "usage: $0 <job-name>" >&2
+    exit 2
+fi
+java -server -cp .:marklogic-xcc-11.1.0.jar:marklogic-corb-2.5.6.jar -DOPTIONS-FILE="$1.properties" com.marklogic.developer.corb.Manager "${XCC_CONNECTION_URI:-xcc://admin:admin@localhost:8011}"
diff --git a/corb2/get-uris.xqy b/corb2/get-uris.xqy
new file mode 100644
index 0000000..df6c30d
--- /dev/null
+++ b/corb2/get-uris.xqy
@@ -0,0 +1,6 @@
+(: CoRB URIS-MODULE: returns the number of URIs, followed by the URI of
+   every document in the dls/latest-version collection. :)
+let $uris := cts:uris("",(),cts:collection-query(
+    ("http://marklogic.com/collections/dls/latest-version")
+))
+return (count($uris), $uris)
diff --git a/corb2/migrate-ncn.properties b/corb2/migrate-ncn.properties
new file mode 100644
index 0000000..e0162ae
--- /dev/null
+++ b/corb2/migrate-ncn.properties
@@ -0,0 +1,11 @@
+# CoRB2 options for the one-off identifier migration (`./corb migrate-ncn`).
+MODULES-DATABASE=caselaw-modules
+INSTALL = 1
+
+URIS-MODULE=get-uris.xqy
+PROCESS-MODULE=migrate-ncn.xqy
+
+# Write whatever the process module returns to a single log file.
+PROCESS-TASK=com.marklogic.developer.corb.ExportBatchToFileTask
+EXPORT-FILE-NAME=migrate-ncn.log
+THREAD-COUNT=10
diff --git a/corb2/migrate-ncn.xqy b/corb2/migrate-ncn.xqy
new file mode 100644
index 0000000..35db7d7
--- /dev/null
+++ b/corb2/migrate-ncn.xqy
@@ -0,0 +1,45 @@
+declare namespace uk = "https://caselaw.nationalarchives.gov.uk/akn";
+
+import module namespace sem = "http://marklogic.com/semantics"
+    at "/MarkLogic/semantics.xqy";
+
+(: This is intended to be a migration script that runs once;
+   it should not be run on a database which already has identifiers.
+
+   CoRB process module, run once per URI returned by get-uris.xqy. It builds
+   a ukncn identifier from the document's uk:cite text and its URI, stores it
+   with xdmp:document-set-property, and returns a log entry for that URI.
+
+   Nothing is stored for URIs under /failures/ or /collisions/, nor for
+   documents with no uk:cite text, which would otherwise be given an
+   identifier with an empty value. :)
+
+declare variable $URI external;
+
+let $cite := fn:doc($URI)//uk:cite/text()
+(: e.g. "/uksc/2024/1.xml" becomes "uksc/2024/1" :)
+let $slug := fn:replace(fn:replace($URI, "\.xml$", ""), "^/", "")
+let $uuid := "id-" || sem:uuid-string()
+(: NOTE(review): the element tags were missing from the copy of this patch
+   that was reviewed; the names below are a reconstruction and must be
+   confirmed against the identifier schema before this is run. :)
+let $node :=
+    <identifiers>
+        <identifier>
+            <namespace>ukncn</namespace>
+            <uuid>{$uuid}</uuid>
+            <value>{$cite}</value>
+            <url_slug>{$slug}</url_slug>
+        </identifier>
+    </identifiers>
+let $outcome :=
+    if (fn:starts-with($URI, "/failures/") or fn:starts-with($URI, "/collisions/"))
+    then "ignored as failure/collision"
+    else if (fn:empty($cite))
+    then "ignored as no cite found"
+    else "set property" || xdmp:document-set-property($URI, $node)
+
+(: The leading "" makes each log entry start with a space. :)
+return fn:string-join(
+    ("", "Processing", $URI, $cite, $uuid, xdmp:quote($node), $outcome),
+    " ")