-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxml-catalog-resolver.xqm
214 lines (188 loc) · 9.18 KB
/
xml-catalog-resolver.xqm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
(:~
: This module provides an XML Resolver for OASIS XML Catalogs entirely in XQuery.
: Use this module to resolve the location a DTD specified in a DOCTYPE before parsing XML.
:
: Tested with BaseX versions 9.7.3 and 11.5
: May work in other XQuery processors.
:
: @author Vincent Lizzi
: @see https://github.com/vincentml/xml-catalog-resolver
: @see https://basex.org/
: @see https://docs.basex.org/wiki/Options#CATFILE
: @see https://xmlresolver.org/
: @see https://xerces.apache.org/xml-commons/components/resolver/resolver-article.html
: @see http://www.sagehill.net/docbookxsl/WriteCatalog.html
:)
module namespace resolver = "xml-catalog-resolver";
import module namespace file = "http://expath.org/ns/file";
(:~ Namespace for OASIS XML Catalogs :)
declare namespace catalog = "urn:oasis:names:tc:entity:xmlns:xml:catalog";
(:~ regular expression to match space characters in XML DOCTYPE :)
declare variable $resolver:space := '[ 	
]+';
(:~ regular expression to match the beginning of an XML DOCTYPE up to the root element name :)
declare variable $resolver:doctype_start := '(<!DOCTYPE' || $resolver:space || '[:_A-Za-zÀ-ÖØ-öø-˿Ͱ-ͽͿ-῿‌-‍⁰-↏Ⰰ-⿯、-퟿豈-﷏ﷰ-�𐀀-][:_\.\-0-9A-Za-z·̀-ͯ‿-⁀À-ÖØ-öø-˿Ͱ-ͽͿ-῿‌-‍⁰-↏Ⰰ-⿯、-퟿豈-﷏ﷰ-�𐀀-]*' || $resolver:space || ')';
(:~
: Parse XML Catalog and return a list of all catalog entries with URIs expanded.
: URI expansion will be based on file paths relative to the XML Catalog file or
: the @xml:base attribute if present in the XML Catalog.
:
: @param $catalog Semicolon-separated list of XML catalog files. Absolute file path works best.
:
: @return Sequence of all entries from all XML Catalogs that were loaded.
:)
declare function resolver:catalogEntries($catalog as xs:string) as element()* {
for $cat in tokenize($catalog, ';\s*') return
let $catxml := (# db:dtd false #) (# db:intparse true #) { doc(file:resolve-path($cat)) }
let $catparent := file:parent($cat)
for $e in $catxml//*
let $base :=
if ($e/ancestor-or-self::catalog:*/@xml:base)
then ($e/ancestor-or-self::catalog:*/@xml:base)[last()]/string()
else $catparent
return typeswitch ($e)
case
element(catalog:system) |
element(catalog:systemSuffix) |
element(catalog:public) |
element(catalog:uri) |
element(catalog:uriSuffix)
return resolver:expandUri($e, $base)
case
element(catalog:nextCatalog) |
element(catalog:delegatePublic) |
element(catalog:delegateSystem) |
element(catalog:delegateURI)
return resolver:catalogEntries(file:resolve-path($e/@catalog, $base))
case
element(catalog:rewriteSystem) |
element(catalog:rewriteURI)
return $e
default return ()
};
declare %private function resolver:expandUri($entry as element(), $base as xs:string) as element() {
copy $c := $entry
modify replace value of node $c/@uri with file:path-to-uri(file:resolve-path($entry/@uri, $base))
return $c
};
declare function resolver:regexEscapeString($string as xs:string) as xs:string {
$string => replace("([\|\\\{\}\(\)\[\]\^\$\+\*\?\.])", "\\$1")
};
(:~
: Resolve XML DOCTYPE using XML Catalog.
: The system literal URI in the DOCTYPE will be replaced with location provided by the XML Catalog.
: The replacement strategy uses regular expressions that closely adhere to the grammar that is defined in the W3C Recommendation.
: No attempt has been made to skip text that looks like a DOCTYPE but isn't, such as a DOCTYPE that is inside a comment.
:
: @param $xml XML as a string
: @param $catalog Semicolon-separated list of XML catalog files. Absolute file path works best.
:
: @return XML string with DOCTYPE resolved using the XML Catalog. If no mapping is found then the string is returned unchanged.
:
: @see https://www.w3.org/TR/xml/#NT-doctypedecl
:)
declare function resolver:resolveDOCTYPE($xml as xs:string, $catalog as xs:string) as xs:string {
let $cat := resolver:catalogEntries($catalog)
return fold-left($cat, $xml, function($x, $c) {
typeswitch ($c)
case element(catalog:public) return
let $public := resolver:regexEscapeString($c/@publicId)
let $match := $resolver:doctype_start || 'PUBLIC' || $resolver:space || '("' || $public || '"|' || "'" || $public || "')" || $resolver:space || "('[^']*'|" || '"[^"]*")'
let $replace := '$1PUBLIC $2 "' || $c/@uri || '"'
return replace($x, $match, $replace)
case element(catalog:system) return
let $system := resolver:regexEscapeString($c/@systemId)
let $match := $resolver:doctype_start || "(PUBLIC" || $resolver:space || "(?:'[^']*'|""[^""]*"")|SYSTEM)" || $resolver:space || "('" || $system || "'|""" || $system || """)"
let $replace := '$1$2 "' || $c/@uri || '"'
return replace($x, $match, $replace)
case element(catalog:systemSuffix) return
let $system := resolver:regexEscapeString($c/@systemIdSuffix)
let $match := $resolver:doctype_start || "(PUBLIC" || $resolver:space || "(?:'[^']*'|""[^""]*"")|SYSTEM)" || $resolver:space || "('[^']*" || $system || "'|""[^""]*" || $system || """)"
let $replace := '$1$2 "' || $c/@uri || '"'
return replace($x, $match, $replace)
case element(catalog:rewriteSystem) return
let $system := resolver:regexEscapeString($c/@systemIdStartString)
let $match := $resolver:doctype_start || "(PUBLIC" || $resolver:space || "(?:'[^']*'|""[^""]*"")|SYSTEM)" || $resolver:space || "(?:'" || $system || "([^']*)'|""" || $system || "([^""]*)"")"
let $replace := '$1$2 "' || $c/@rewritePrefix || '$3$4"'
return replace($x, $match, $replace)
default return $x
})
};
(:~
: Resolve a URI using XML Catalog.
:
: @param $uri The URI to resolve
: @param $catalog Semicolon-separated list of XML catalog files. Absolute file path works best.
:
: @return The resolved URI. If no mapping is found in the XML Catalog the URI will be returned unchanged.
:)
declare function resolver:resolveURI($uri as xs:string, $catalog as xs:string) as xs:string {
let $cat := resolver:catalogEntries($catalog)
return fold-left($cat, $uri, function($x, $c) {
typeswitch ($c)
case element(catalog:uri) return
if ($c/@name eq $uri) then string($c/@uri) else $x
case element(catalog:uriSuffix) return
if (ends-with($x, $c/@uriSuffix))
then string($c/@uri)
else $x
case element(catalog:rewriteURI) return
if (starts-with($x, $c/@uriStartString))
then concat($c/@rewritePrefix, substring-after($x, $c/@uriStartString))
else $x
default return $x
})
};
(:~
: Parse XML using XML Catalog
:
: @param $xml an XML string or file path to the XML file
: @param $catalog Semicolon-separated list of XML catalog files. Absolute file path works best.
:
: @return parsed XML document
:)
declare function resolver:parse-xml($xml as xs:string, $catalog as xs:string) as document-node() {
let $temp := file:create-temp-file('xml-catalog-resolver', '.xml')
let $raw := if ($xml castable as xs:anyURI) then unparsed-text($xml) else $xml
let $resolved := resolver:resolveDOCTYPE($raw, $catalog)
return (
file:write-text($temp, $resolved),
(# db:dtd true #) (# db:intparse false #) { doc($temp) },
file:delete($temp)
)
};
(:~
: Parse XML using XML Catalog
:
: @param $xml an XML string or file path to the XML file
: @param $catalog Semicolon-separated list of XML catalog files. Absolute file path works best.
: @param $path File path to a location where the XML will be written before being parsed in order to control base-uri()
:
: @return parsed XML document
:)
declare function resolver:parse-xml($xml as xs:string, $catalog as xs:string, $path as xs:string) as document-node() {
let $raw := if ($xml castable as xs:anyURI) then unparsed-text($xml) else $xml
let $resolved := resolver:resolveDOCTYPE($raw, $catalog)
return (
file:write-text($path, $resolved),
(# db:dtd true #) (# db:intparse false #) { doc($path) }
)
};
(:~
: Modifies a DOCTYPE to remove a PUBLIC or SYSTEM reference to an external DTD.
: If the DOCTYPE contains an internal DTD then the internal part will remain intact.
: The intention for this function is to prevent loading an external DTD when it is
: known that the DTD is not needed, and mainly for parsing XML Catalogs.
: With BaseX parsing options set to INTPARSE=true and DTD=false this function is not needed.
:
: @param $xml XML as a string
:
: @return XML string with the DOCTYPE modified. If no PUBLIC or SYSTEM reference is present then the string is returned unchanged.
:
: @see https://www.w3.org/TR/xml/#NT-doctypedecl
: @see https://docs.basex.org/wiki/Options#INTPARSE
: @see https://docs.basex.org/wiki/Options#DTD
:)
declare function resolver:removeExternalDTD($xml as xs:string) as xs:string {
let $match := $resolver:doctype_start || "(PUBLIC" || $resolver:space || "(?:'[^']*'|""[^""]*"")|SYSTEM)" || $resolver:space || "(?:'[^']*'|""[^""]*"")"
return replace($xml, $match, "$1")
};