-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathresolver.rb
1195 lines (1038 loc) · 40.3 KB
/
resolver.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
require 'intertwingler/error'
require 'intertwingler/graphops'
require 'intertwingler/vocab'
require 'intertwingler/loggable'
require 'intertwingler/util/clean'
require 'uri'
require 'uuidtools'
require 'uuid/ncname'
require 'digest'
require 'base64'
# This class is intended to be a caching URI (and URI-adjacent)
# resolver, intended to persist only as long as it needs to, as the
# cache is not very sophisticated.
class Intertwingler::Resolver
include Intertwingler::Loggable
include Intertwingler::Util::Clean
private
# Local shorthands for the vocabularies used below. ITCV, XSD and SH
# are used by self.configure_one; TFO is not referenced in the portion
# of the file visible here.
ITCV = Intertwingler::Vocab::ITCV
TFO = Intertwingler::Vocab::TFO
XSD = RDF::XSD
SH = RDF::Vocab::SH
# Build a single resolver from the configuration hanging off `subject`
# in the graph.
#
# @param repo [RDF::Repository] the graph holding the configuration
# @param subject [RDF::URI, RDF::Node, #to_s] the resolver's subject
# @param log [Object, nil] handed to the constructor untouched
#
# @return [Object] a new instance of this class
#
def self.configure_one repo, subject, log: nil
  # the subject has to be a resource before we can query with it
  subject = coerce_resource subject

  # the site under management (first in sort order) and its aliases
  base    = repo.objects_for(subject, ITCV.manages, only: :uri).sort.first
  aliases = repo.objects_for(subject, ITCV.alias, only: :uri).sort

  # collect the prefix declarations into a { prefix: "namespace" } map
  prefixes = {}
  repo.objects_for(subject, ITCV.prefix, only: :resource).each do |decl|
    name = repo.objects_for(decl, SH.prefix, only: :literal).sort.first
    ns   = repo.objects_for(decl, SH.namespace,
      only: :literal, datatype: XSD.anyURI).sort.first
    # a declaration only counts when both halves are present
    prefixes[name.value.to_sym] = ns.value if name && ns
  end

  # the vocab is either an IRI or an xsd:anyURI literal and is stored
  # under the nil key
  vocabs = repo.objects_for(subject, ITCV.vocab).select do |o|
    o.iri? or o.literal? && o.datatype? && o.datatype == XSD.anyURI
  end
  vocab = vocabs.sort.first
  prefixes[nil] = vocab.value if vocab

  # document classes and the (ordered) list of fragment specifications
  documents = repo.objects_for(subject, ITCV.document, only: :uri)
  fragments = repo.objects_for(
    subject, ITCV[:'fragment-list'], only: :resource)

  unless fragments.empty?
    specs = RDF::List.new(subject: fragments.first, graph: repo).to_a
    fragments = specs.map do |spec|
      # fragment class
      klass = repo.objects_for(spec, ITCV[:'fragment-spec'], only: :uri)
      # the route to the host, expressed as shacl property paths
      paths = repo.objects_for(spec, ITCV.via, only: :resource).map do |path|
        repo.process_shacl_path path
      end
      # host class, and the exceptions to it
      hosts   = repo.objects_for(spec, ITCV[:'host-class'], only: :uri)
      exempts = repo.objects_for(spec, ITCV[:'except-class'], only: :uri)

      [klass, paths, hosts, exempts]
    end
  end

  new repo, base, aliases: aliases, prefixes: prefixes,
    subject: subject, documents: documents, fragments: fragments, log: log
end
# this is dumb but we have both URI and RDF::URI in the mix
# Swap the scheme, host and port of an HTTP(S) URI for those of `base`,
# as long as `base` is one of this resolver's authorities.
#
# @param uri [URI, RDF::URI] the URI to rewrite
# @param base [URI, RDF::URI, nil] the base (alias) to rewrite it to
#
# @return [URI, RDF::URI] a rewritten copy, or `uri` itself when no
#  rewrite applies
#
def set_alias uri, base
  # only http(s) is eligible; this works for URI and RDF::URI alike
  return uri unless /^https?$/i.match? uri.scheme
  # nothing to rewrite to
  return uri unless base and base.respond_to? :authority
  # the target has to be one of our own authorities
  return uri unless authorities.include? base.authority.downcase

  # the input may be frozen, so modify a copy
  uri.dup.tap do |out|
    out.scheme = base.scheme
    out.host   = base.host
    out.port   = base.port
  end
end
public
# Return a hash mapping a set of RDF prefixes to their vocabularies.
#
# @param prefixes [Hash, #to_h] the input prefixes
# @param nonnil [false, true] whether to remove the nil prefix
#
# @return [Hash{Symbol=>RDF::Vocabulary}] sanitized prefix map
#
def sanitize_prefixes prefixes, nonnil: false
  # hand off to the shared implementation, lending it our vocab cache
  cleaner = Intertwingler::Util::Clean
  cleaner.sanitize_prefixes(prefixes, nonnil: nonnil, cache: @vocabs)
end
# Find the subjects in the graph that are typed as resolvers.
#
# @param repo [#all_of_type] the graph to search
#
# @return [Object] whatever `repo.all_of_type` returns for `ITCV.Resolver`
#
def self.locate repo
  resolver_type = Intertwingler::Vocab::ITCV.Resolver
  repo.all_of_type(resolver_type)
end
# Configure one or more resolvers out of the graph. With `subject`,
# the given subject(s) are configured; with `authority`, the single
# resolver managing that authority is looked up; with neither, every
# resolver found by `locate` is configured.
#
# @param repo [RDF::Repository] the graph holding the configuration
# @param subject [RDF::URI, #to_s, Array, nil] explicit resolver
#  subject(s)
# @param authority [String, nil] a host(:port) to find the resolver by
# @param log [Object, nil] passed along to every resolver configured
#
# @raise [Intertwingler::Error::Config] if `authority` matches no
#  resolver, or more than one
#
# @return [Object, Array] a single resolver for a scalar `subject` or
#  an `authority`; an array of resolvers otherwise
#
def self.configure repo, subject: nil, authority: nil, log: nil
  if subject
    if subject.is_a? Array
      return subject.map { |s| configure_one repo, s, log: log }
    end

    configure_one repo, subject, log: log
  elsif authority
    # the managed base could be registered under either scheme
    candidates = %w[http https].map do |scheme|
      base = RDF::URI("#{scheme}://#{authority}/")
      repo.subjects_for(ITCV.manages, base, only: :resource)
    end.flatten.uniq

    case candidates.size
    when 1 then configure_one repo, candidates.first, log: log
    when 0 then raise Intertwingler::Error::Config,
      "No resolver found for #{authority}"
    else raise Intertwingler::Error::Config,
      'Multiple resolvers identified for %s: %s' %
      [authority, candidates.join(', ')]
    end
  else
    # no hints at all: configure everything we can find
    locate(repo).map { |s| configure_one repo, s, log: log }
  end
end
# Read-only accessors for the state set up by the constructor.
attr_reader :repo, :base, :aliases, :prefixes, :id, :documents, :fragments
# `subject` is a second name for `id`.
alias_method :subject, :id
# Create a new URI resolver.
#
# @param repo [RDF::Repository] where we get our data from
# @param base [URI, RDF::URI] base _URL_ (as in dereferenceable)
# @param aliases [Array<URI, RDF::URI>] alternative base URLs to
# be treated as equivalent in lookups
# @param prefixes [Hash{Symbol, nil => RDF::Term}] the prefix map
#
def initialize repo, base, aliases: [], prefixes: {}, subject: nil,
    documents: [], fragments: [], log: nil
  raise ArgumentError, 'repo must be RDF::Queryable' unless
    repo.is_a? RDF::Queryable

  @repo = repo
  @log  = log

  # The caches are created before anything else, because
  # sanitize_prefixes (called below) hands @vocabs to the cleaner as
  # its cache and would otherwise pass nil.

  # cache of subjects in the graph so we only look them up once
  @subjects = {}
  # cache of URIs (likely but not necessarily UUIDs) to host
  # documents (UUIDs), or nils where the URIs are themselves full
  # documents
  @hosts = {}
  # uri -> uuid cache
  @uuids = {}
  # uuid -> uri cache
  @uris = {}
  # map uri.to_s to rdf vocab
  @vocabs = {}

  # set the base uri; store it as a URI rather than RDF::URI. this
  # has to precede the other coercions, since coerce_resource resolves
  # against @base once it is set
  @base      = coerce_resource base, as: :uri
  @aliases   = coerce_resources aliases, as: :uri
  @prefixes  = sanitize_prefixes prefixes
  @id        = coerce_resource subject if subject
  @documents = documents.to_set
  @fragments = fragments # this is in order
end
# Clear the resolver's caches but otherwise keep its configuration.
#
# @return [true] constant true return value that can be ignored
#
def flush
  # wipe every cache in place; the hash objects themselves are kept
  @subjects.clear
  @hosts.clear
  @uuids.clear
  @uris.clear
  @vocabs.clear

  # constant return value so the last hash does not leak out
  true
end
private
# URI-splitting pattern with nine capture groups (separators included
# in the odd ones); not referenced in the portion of the file visible
# here.
R3986 = /^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/
# Matches any single character outside the set that preproc leaves
# unescaped; binary (/n) because preproc works on byte strings.
SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,:;=._~-]/n
# URI-splitting pattern used by preproc, with five captures: scheme,
# authority, path, query, fragment (separators excluded).
RFC3986 =
/^(?:([^:\/?#]+):)?(?:\/\/([^\/?#]*))?([^?#]+)?(?:\?([^#]*))?(?:#(.*))?$/
# The [leading, trailing] separators to put back around each of the
# five RFC3986 captures, indexed by capture position.
SEPS = [['', ?:], ['//', ''], ['', ''], [??, ''], [?#, '']].freeze
public
# Return the set of URI authorities (host-port pairs) associated
# with this resolver, beginning with the base and following with
# the aliases.
#
# @return [Array<String>] the authorities
#
def authorities
  seen = []
  # base first, then the aliases, in their stored order
  ([@base] + @aliases).each do |u|
    # skip anything that has no notion of an authority
    next unless u.respond_to? :authority
    auth = u.authority.to_s.downcase
    # keep the first occurrence only
    seen << auth unless seen.include? auth
  end
  seen
end
# Preprocess a candidate (absolute or relative) URI to produce a
# valid URI string with the minimally valid set of escaped
# characters. That is, characters that are unnecessarily
# percent-encoded (`%XX`) are turned back to their ASCII
# originals, while ensuring that characters that absolutely _must_
# be encoded, remain as such.
#
# @param uri [#to_s] the URI, absolute or relative, treated as a
# string
# @param extra [#to_s] a set of additional characters to be
# escaped, treated as a {Regexp} character class
#
# @return [String] the preprocessed URI string
#
def self.preproc uri, extra = ''
  # a percent sign that does not introduce a two-digit hex escape is
  # malformed, so it gets escaped itself
  raw = uri.to_s.b.gsub(/%(?![0-9A-Fa-f]{2})/n, '%25')

  # escape whatever extra characters the caller nominated
  unless extra.empty?
    raw.gsub!(/([#{Regexp.quote extra}])/) { |ch| '%%%02X' % ch.ord }
  end

  # take the URI apart so the separators themselves are left alone,
  # escape inside each component, and stitch the result back together
  out = ''
  RFC3986.match(raw).captures.each_with_index do |part, idx|
    next if part.nil?
    lead, trail = SEPS[idx]
    out << lead << part.b.gsub(SF) { |ch| '%%%02X' % ch.ord } << trail
  end

  # make sure escaped hex is upper case like the rfc says
  out.gsub(/(%[0-9A-Fa-f]{2})/) { |esc| esc.upcase }
end
# Make the class-level preproc available as an instance method as
# well, along with a longer-named alias for it.
define_method :preproc, self.singleton_method(:preproc).to_proc
alias_method :preprocess, :preproc
# Given a URI as input, split any path parameters out of the last
# path segment. Works the same way as #split_qp.
#
# @param uri [URI, #to_s] The URI to extract parameters from
# @param only [false, true] whether to only return the parameters
# @param parse [false, true] whether to parse the path parameters
# @return [Array] (See description)
#
def self.split_pp uri, only: false, parse: false
  parsed = begin
    (uri.is_a?(URI) ? uri : URI(preproc uri.to_s)).normalize
  rescue URI::InvalidURIError => e
    # these stock error messages don't even tell you what the uri is
    raise URI::InvalidURIError, "#{e.message} (#{uri.to_s})"
  end

  # no path means no path parameters; the input is echoed back as-is
  unless parsed.path
    return only ? [] : [uri]
  end

  # only the last segment can carry parameters
  *dirs, leaf = parsed.path.split(?/, -1)
  params = leaf.split(?;, -1)
  # whatever precedes the first semicolon is still part of the path
  stem = (dirs + [params.shift]).join ?/

  if parse
    # turn "k=a,b" into ["k", ["a", "b"]] and a bare "k" into ["k", []]
    params = params.map do |param|
      key, val = param.split ?=, 2
      [key, val ? val.split(?,) : []]
    end
  end

  # install the parameterless path on a copy
  out = parsed.dup
  begin
    out.path = stem
  rescue URI::InvalidURIError => e
    # these stock error messages don't even tell you what the uri is
    raise URI::InvalidURIError, "#{e.message} (#{out.to_s}, #{stem})"
  end

  only ? params : [out] + params
end
# Make the class-level split_pp available as an instance method too.
define_method :split_pp, self.singleton_method(:split_pp).to_proc
# Given a URI as input, split any query parameters into an array of
# key-value pairs. If `:only` is true, this will just return the
# pairs. Otherwise it will prepend the query-less URI to the array,
# and can be captured with an idiom like `uri, *qp = split_qp uri`.
#
# @param uri [RDF::URI, URI, #to_s] The URI to extract parameters from
# @param only [false, true] whether to only return the parameters
# @param as [:uri, :rdf] the form to coerce the URI into before splitting
#
# @return [Array] (See description)
#
def split_qp uri, only: false, as: :uri
  uri = coerce_resource uri, as: as

  query = uri.query
  # no query string, no pairs
  unless query
    return only ? [] : [uri]
  end

  pairs = URI::decode_www_form(query)
  return pairs if only

  # strip the query off a copy: coercion can hand back the caller's
  # own object, which may be frozen and in any case should not be
  # modified behind the caller's back
  bare = uri.dup
  bare.query = nil
  [bare] + pairs
end
# Get the fragment, or otherwise the last non-empty path segment
# of the URI. Returns `nil` if the URI is not the kind that has a
# path.
#
# @param uri [URI, RDF::URI, #to_s] the URI input
#
# @return [String, nil] the slug, if present
#
def terminal_slug uri
  uri = coerce_resource uri, as: :uri

  # not the kind of thing that has a path
  return unless uri.respond_to? :path

  # a non-empty fragment always wins
  frag = uri.fragment
  return frag if frag and not frag.empty?

  # from here on every dead end yields the empty string
  path = uri.path or return ''

  # only absolute paths are considered; trim the slashes off both ends
  trimmed = /^\/+(.*?)\/*$/.match(path) or return ''

  # the last non-empty segment, if there is one
  segment = trimmed[1].split(/\/+/).last or return ''

  # drop any path parameters, then escape colons or consumers will
  # think the slug is an absolute URI
  preproc(segment.split(/;+/).first || '', ':')
end
private
# A canonical (hyphenated, hexadecimal) UUID on word boundaries,
# captured as group 1; case-insensitive.
UUID_ONLY = /\b([0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8})\b/i
# A whole string that is a UUID, with or without the `urn:uuid:` prefix.
UUID_RE = /^(?:urn:uuid:)?#{UUID_ONLY}$/i
# A UUID at the start of a path, after one or more leading slashes.
UUID_PATH = /^\/+#{UUID_ONLY}/
# lol ruby booleans do not coerce to integers so here we are
BITS = { nil => 0, false => 0, true => 1 }.freeze
public
# Make coerce_resource callable on the class itself; the class methods
# configure_one and resolve_curie call it without an instance.
define_singleton_method :coerce_resource,
Intertwingler::Util::Clean.method(:coerce_resource).unbind
# Coerce the argument into a resource, either {URI} or {RDF::URI}
# (or {RDF::Node}). The type can be specified
#
# @param arg [#to_s, URI, RDF::URI, RDF::Node] the argument to
# coerce into a resource
# @param as [:rdf, :uri, :term, false, nil] how to coerce the result
#
# @return [RDF::URI, URI, RDF::Vocabulary::Term, RDF::Vocabulary, String]
#
def coerce_resource arg, as: :rdf
  # call the module directly; the original author notes that going
  # through self.class is slow
  Intertwingler::Util::Clean.coerce_resource(arg, as: as) do |candidate|
    begin
      if @base
        # resolve against the base once there is one
        @base.merge(preproc candidate.to_s.strip)
      else
        candidate
      end
    rescue URI::InvalidURIError => e
      # complain, then yield nothing rather than blowing up
      warn "attempted to coerce #{candidate} which turned out to be invalid: #{e}"
      nil
    end
  end
end
# Return the UUID(s) associated with the subject. May return
# `nil` or be a no-op, if specified.
#
# @param uri [URI, RDF::URI, #to_s] the URI to resolve
# @param scalar [true, false] whether to return only one UUID if
# more than one is resolved; always returns an array if false
# @param verify [true, false] whether any UUID found in the input
# should be resolved against the graph
# @param as [:rdf, :uri, :str] coerce the output to either
# {RDF::URI}, {URI}, or string literal, respectively
# @param published [false, true] whether to constrain the
# UUID resolution to published documents only
# @param noop [false, true] whether to return `nil` or otherwise
# just echo back the input if a UUID can't be resolved
#
# @return [URI, RDF::URI, Array<URI, RDF::URI>, nil]
#
def uuid_for uri, scalar: true, verify: true, as: :rdf,
    published: false, circulated: false, noop: false
  # this ensures the input is an absolute RDF::URI
  orig = uri = coerce_resource uri

  # do an initial check here to determine if we already have a UUID
  unless uri.is_a? RDF::Node
    # swap in our canonical scheme/authority
    if %w[http https].include? uri.scheme.to_s.downcase and
        uri.authority and authorities.include? uri.authority.downcase
      uri = uri.dup # duplicate because frozen sometimes
      uri.scheme = @base.scheme
      uri.authority = @base.authority
    end

    # we make a URI object so we can test it easier
    tu = URI(preproc uri).normalize

    # if we find a uuid in the path, we extract it
    if tu.path && !tu.fragment &&
        UUID_RE.match?(uu = tu.path.delete_prefix(?/))
      tu = URI('urn:uuid:' + uu.downcase)
    end

    # unconditionally overwrite the URI
    uri = RDF::URI(tu.to_s)

    # now we check for a compact UUID fragment or UUID URN
    if tu.fragment and
        (uu = UUID::NCName.from_ncname(tu.fragment, validate: true))
      # this is the special case that the fragment is a compact uuid
      uu = RDF::URI("urn:uuid:#{uu}")
      if !verify or @subjects[uu] ||= @repo.has_subject?(uu)
        uu = coerce_resource uu, as: as
        return scalar ? uu : [uu]
      end
    elsif tu.respond_to? :uuid
      # in this case the URI is already a UUID, so now we check
      # if it's a subject
      if !verify or @subjects[uri] ||= @repo.has_subject?(uri)
        uri = coerce_resource uri, as: as
        return scalar ? uri : [uri]
      end
    end
  end

  # return our result from cache if present (note the cache is keyed
  # on the coerced input only)
  if out = @uuids[orig]
    out = coerce_resources out, as: as
    return scalar ? out.first : out
  end

  # this (assuming the input has a path) will give us a stack of
  # URIs containing successively fewer path parameters (if there
  # are no path parameters or no path, then there will only be the
  # one URI).
  uris = if uri.respond_to? :path and uri.path.start_with? ?/
           # split off path parameters
           uu, *pp = split_pp uri
           if pp.empty?
             [uri] # no path parameters; this is a noop
           else
             uu = RDF::URI(uu.to_s)
             bp = uu.path # base path
             # this counts down from all parameters to zero
             (0..pp.length).to_a.reverse.map do |i|
               u = uu.dup
               u.path = ([bp] + pp.take(i)).join(?;)
               u # uri with the first 0..i path parameters
             end
           end
         else
           [uri] # URI does not have a path
         end

  # prior to other criteria, we sort UUID candidates by two
  # dimensions: exact match on a URI (i.e., where the candidate is
  # a resource vs whether it is just a slug), and canonicality
  # (i.e., whether the relation has been demarcated as canonical).
  # This list shows how the ranks map to bits, and then integers,
  # so they can be compared with an ordinary <=> operator.
  #
  # * (00) exact & canonical == 0,
  # * (01) exact == 1,
  # * (10) inexact & canonical == 2,
  # * (11) inexact == 3.
  #
  # Subsequent comparison criteria include whether the resource
  # is considered "published", and its latest associated date.

  # obtain the raw candidates for our stack of URIs using the
  # ci:canonical/owl:sameAs mechanism, ie exact match
  sa = @repo.property_set [Intertwingler::Vocab::CI.canonical,
    Intertwingler::Vocab::CI.alias, RDF::OWL.sameAs]
  candidates = nil
  uris.each do |u|
    # this will give us a hash where the keys are the candidate
    # subjects and the values are the structs we sort on
    candidates = @repo.subjects_for(sa, u, entail: false) do |s, f|
      # skip non-uuid subjects
      next unless UUID_RE.match? s
      [s, {
        # we xor this because BITS[true] ^ 1 == 0, and 0 < 1
        rank: BITS[f.include? Intertwingler::Vocab::CI.canonical] ^ 1,
        published: @repo.published?(s, circulated: circulated),
        ctime: @repo.dates_for(s,
          predicate: RDF::Vocab::DC.created).last || DateTime.new,
        mtime: @repo.dates_for(s).last || DateTime.new }]
    end.compact.to_h
    # this is a funny way to say quit on the first match
    break unless candidates.empty?
  end

  # after we have checked the URI(s) verbatim against the graph,
  # but before we start checking slugs, we can try to harvest some
  # host documents, assuming our URI has a fragment.
  hosts = if uri.uri? and uri.fragment and not uri.fragment.empty?
            tmp = uri.dup
            tmp.fragment = nil
            h = uuid_for tmp, scalar: false, published: published,
              circulated: circulated, noop: noop
            # a fragment URI for which the non-fragment part does
            # not resolve to a UUID should likewise not resolve
            # (XXX: or should it?)
            return scalar ? nil : [] if h.empty?
            h # the hosts
          end

  # okay *now* do the slugs
  slug = terminal_slug uri
  if slug and slug != ''
    exact = uri == coerce_resource(slug)
    sl = [Intertwingler::Vocab::CI['canonical-slug'],
      Intertwingler::Vocab::CI.slug]
    [RDF::XSD.string, RDF::XSD.token].each do |t|
      repo.subjects_for(sl, RDF::Literal(slug, datatype: t)) do |s, f|
        # skip non-uuid subjects
        next unless UUID_RE.match? s
        entry = candidates[s] ||= {
          rank: 0b11,
          published: @repo.published?(s, circulated: circulated),
          ctime: @repo.dates_for(s,
            predicate: RDF::Vocab::DC.created).last || DateTime.new,
          mtime: @repo.dates_for(s).last || DateTime.new }
        # reset the rank if it is a lower number (higher rank)
        rank = (BITS[exact] << 1 | BITS[f.include? sl.first]) ^ 0b11
        entry[:rank] = rank if rank < entry[:rank]
      end
    end
  end

  # okay now that we have the candidates, let's make sure, e.g.,
  # that the fragment actually maps to a host
  if hosts
    # XXX jklol this needs to be implemented
  end

  # here is where we go sniffing for replacements. we turn the
  # candidates hash into an array to iterate over it because we
  # mess with the actual candidates hash in the loop
  candidates.to_a.each do |k, v|
    # find any replacements
    reps = @repo.replacements_for(k, published: published) - [k]
    #
    unless reps.empty?
      v[:replaced] = true
      reps.each do |r|
        # NOTE(review): unlike the lookups above, this published?
        # call does not pass `circulated:` -- confirm that is intended
        c = candidates[r] ||= {
          rank: v[:rank], published: @repo.published?(r),
          ctime: @repo.dates_for(r, predicate: RDF::Vocab::DC.created).last ||
            v[:ctime] || DateTime.new,
          mtime: @repo.dates_for(r).last || v[:mtime] || DateTime.new }
        # adjust rank and modification time of the replacement to
        # that of the replaced if they are more favourable
        c[:rank]  = v[:rank]  if v[:rank]  < c[:rank]
        c[:mtime] = v[:mtime] if v[:mtime] > c[:mtime]
        c[:ctime] = v[:ctime] if v[:ctime] > c[:ctime]
      end
    end
  end

  # now we can remove all unpublished candidates if the context is
  # published
  candidates.select! do |_, v|
    !v[:replaced] && (published ? v[:published] : true)
  end

  out = candidates.sort do |a, b|
    # we are mainly interested in the structs we generated
    ax, bx = [a, b].map(&:last)
    # check publication status (contingent), rank, then modification
    # time; published sorts ahead of unpublished, hence b before a
    c = published ? BITS[bx[:published]] <=> BITS[ax[:published]] : 0
    c = ax[:rank] <=> bx[:rank] if c == 0
    c = bx[:mtime] <=> ax[:mtime] if c == 0
    c = bx[:ctime] <=> ax[:ctime] if c == 0
    # finally compare lexically if none of the others resolve
    c == 0 ? a.first <=> b.first : c
  end.map(&:first).compact

  if out.empty?
    # ensure we return noop
    out << orig if noop
  else
    # cache if there is something to cache
    @uuids[orig] = out
  end

  # make these into uri objects if requested
  out = coerce_resources out, as: as

  # return the first (ie most preferred) UUID
  scalar ? out.first : out
end
# Return the (hopefully dereferenceable) URI(s) associated with
# the subject. This will always return something even if it is a
# no-op, such as URIs that are valid but not present in the graph.
# A resource determined to be a document fragment will be resolved
# to its host document and appended as a fragment identifier.
# Blank nodes are skolemized, resolved to `/.well-known/genid/...`
# if they can't be resolved to a host document. UUIDs (in
# canonical or UUID-NCName form) are resolved (again, if not
# fragments) as a single path segment off the base (`/<uuid>`).
# CURIEs are expanded into their respective terms.
#
# @param term [RDF::URI, RDF::Node, URI, String] URI, blank
# node, UUID, or CURIE to be resolved
# @param as [:rdf, :uri] coerce the output to one or the other form
# @param relative [false, true] return relative to base
# @param roundtrip [false, true] resolve UUID first
# @param slugs [false, true] attempt to resolve from slugs as well
# @param fragments [true, false] resolve fragment URIs
# @param local [false, true] only return URIs with the same
# authority as the base
# @param via [URI, RDF::URI] a base URI, e.g. one of the aliases
#
# @return [URI, RDF::URI, Array<URI, RDF::URI>] the URI(s)
#
# NOTE(review): the `relative` keyword is accepted but not consulted
# anywhere in this method body.
def uri_for term, scalar: true, as: :rdf, relative: false, roundtrip: false,
slugs: false, fragments: true, local: false, via: nil
term = coerce_resource term
# a `via` that is not one of our own authorities is discarded
if via
via = coerce_resource via
via = nil unless authorities.include? via.authority.downcase
end
# harvest uuid out of term if present: either the term is itself a
# UUID (bare or URN), or its fragment is a valid UUID-NCName; if
# neither clause matches, `uuid` ends up nil
uuid = case
when m = UUID_RE.match(term.to_s)
RDF::URI("urn:uuid:#{m.captures.first.downcase}")
when term.respond_to?(:fragment) &&
v = UUID::NCName.valid?(term.fragment)
RDF::URI(UUID::NCName.from_ncname(
term.fragment, version: v, format: :urn))
end
# warn "wat #{uuid} -> #{term}"
# now we do the round trip if called for; without a round trip only
# a harvested uuid lets us continue, otherwise the (coerced, aliased)
# input is echoed back
if tmp = roundtrip ? uuid_for(uuid || term) : uuid
term = tmp
else
term = coerce_resource term, as: as
term = set_alias term, via
return scalar ? term : [term]
end
# give us the host uri if available
hosturi = if uuid
# XXX what do we do about explicit graphs? also published?
host = host_for uuid
# note function-level scope of hosturi
uri_for(host, slugs: true, via: via) if host
end
# create an appropriate map function depending on whether there
# is a host URI so the condition is only tested once: with a host the
# slug becomes its fragment, without one it is resolved against the
# base
umap = if hosturi
lambda do |o|
h = hosturi.dup
h.fragment = o.value
h
end
else
lambda { |o| @base + o.value }
end
# generate a comparator proc
cmp = @repo.cmp_resource prioritize: [@base] + @aliases
# obtain a sorted list of primary URIs (those identified by
# ci:canonical and ci:canonical-slug)
primary = @repo.objects_for(term,
Intertwingler::Vocab::CI.canonical, only: :resource).sort(&cmp)
# `host` is nil here unless the uuid branch above assigned it; slugs
# are only consulted when there is a host or the caller asked for them
if term.uri? and (host or slugs) and (primary.empty? or not scalar)
primary += @repo.objects_for(term,
Intertwingler::Vocab::CI['canonical-slug'],
only: :literal, datatype: RDF::XSD.token).map(&umap).sort(&cmp)
end
# secondary URIs are only gathered when there is no primary one, or
# when the caller wants the whole list
secondary = []
if primary.empty? or not scalar
secondary = @repo.objects_for(term,
[RDF::OWL.sameAs, Intertwingler::Vocab::CI['alias-for']],
entail: false, only: :resource).sort(&cmp)
if term.uri? and (slugs or host)
secondary += @repo.objects_for(term,
Intertwingler::Vocab::CI.slug, entail: false,
only: :literal, datatype: RDF::XSD.token).map(&umap).sort(&cmp)
end
end
# in the final case append the UUID to the base (or, when there is a
# host, to the host as a UUID-NCName fragment)
uri = URI(preproc term)
if uri.respond_to? :uuid
if hosturi
h = hosturi.dup
h.fragment = UUID::NCName.to_ncname uri.uuid
secondary << RDF::URI(h.to_s)
else
u = @base.clone
u.query = u.fragment = nil
u.path = ?/ + uri.uuid
secondary << RDF::URI(u.to_s)
end
end
# primary results always precede secondary ones
out = (primary + secondary).uniq
# eliminate fragment URIs unless explicitly allowed; if that would
# leave nothing at all, the unfiltered list is kept
unless fragments
tmp = out.reject(&:fragment)
out = tmp unless tmp.empty?
end
# eliminate non-local URIs
out.select! do |u|
/^https?$/i.match? u.scheme and u.authority == base.authority
end if local
# apply the alias (if any), and turn these into URIs if the thing
# says so
out.map! do |u|
u = set_alias u, via
u = URI(preproc u.to_s) if as == :uri
u
end
scalar ? out.first : out
end
# XXX 2022-05-17 NOTE THAT Intertwingler::Util::resolve_curie is more
# complex than this; it assumes you can hand it stuff from
# existing markup, so this is kinda the lite version
# Resolve a CURIE to a full URI using the embedded prefix map,
# with optional overrides. Multiple values, including CURIE
# strings containing spaces, will be split and expanded out
# individually. Safe CURIEs (as in encased in square brackets) are
# handled appropriately.
#
# @param curie [String, Array<String>] one or more CURIEs
# @param as [:rdf, :uri, :term, false, nil] coercion types
# @param scalar [true, false] whether to return a single value
# @param base [URI, RDF::URI] overriding base URI
# @param prefixes [Hash{Symbol, nil => RDF::Vocabulary}]
# overriding prefix map, if needed
#
# @return [URI, RDF::URI, Array<URI, RDF::URI>, nil]
#
def self.resolve_curie curie,
    as: :term, scalar: true, base: nil, prefixes: {}, noop: false
  # rdf: is always available, unless the caller maps it elsewhere
  cleaned  = Intertwingler::Util::Clean.sanitize_prefixes prefixes
  prefixes = { rdf: RDF::RDFV }.merge(cleaned)

  # reduce the input to a flat list of whitespace-separated tokens
  inputs = curie.respond_to?(:to_a) ? curie.to_a : [curie]
  tokens = inputs.map do |c|
    Intertwingler::Util::Clean.normalize_space(c.to_s).split
  end.flatten.compact

  out = tokens.map do |token|
    # optional square brackets (safe CURIE), optional prefix, then slug
    prefix, slug = /^\[?(?:([^:]+):)?(.*?)\]?$/.match(token).captures
    prefix = prefix.to_sym if prefix

    ns = prefixes[prefix]
    term =
      if ns
        # note that we will need another resolve_curie for
        # dealing with markup
        case ns
        when RDF::Vocabulary then ns[slug]
        when RDF::URI then ns + slug
        else RDF::URI(ns.to_s + slug)
        end
      elsif noop
        # unknown prefix: hand the token back untouched
        token
      end

    term && coerce_resource(term, as: as)
  end.compact

  scalar ? out.first : out
end
# Resolve a CURIE to a full URI using the embedded prefix map,
# with optional overrides. Multiple values, including CURIE
# strings containing spaces, will be split and expanded out
# individually. Safe CURIEs (as in encased in square brackets) are
# handled appropriately.
#
# @param curie [String, Array<String>] one or more CURIEs
# @param as [:rdf, :uri, :term, false, nil] coercion types
# @param scalar [true, false] whether to return a single value
# @param base [URI, RDF::URI] overriding base URI
# @param prefixes [Hash{Symbol, nil => RDF::Vocabulary}]
# overriding prefix map, if needed
#
# @return [URI, RDF::URI, Array<URI, RDF::URI>, nil]
#
def resolve_curie curie, as: :term, scalar: true, base: nil,
    prefixes: {}, noop: false
  # an explicit base takes precedence over the resolver's own
  base = if base
           coerce_resource(base, as: :uri)
         else
           @base
         end

  # overriding prefixes are layered on top of the embedded map
  merged = @prefixes.merge(sanitize_prefixes prefixes)

  # the class-level method does the actual work
  self.class.resolve_curie(curie,
    as: as, scalar: scalar, base: base, prefixes: merged, noop: noop)
end
# Abbreviate one or more URIs into one or more CURIEs if we
# can. Will pass through if `noop:` is true, or if false, return `nil`
# for any URI that can't be abbreviated this way.
#
# @param term [URI, RDF::URI, #to_s, Array<URI, RDF::URI, #to_s>]
# the URI(s) to abbreviate
# @param scalar [true, false] always returns an array if false;
# ignored if passed an array
# @param noop [true, false] whether to leave the input alone if it
# can't abbreviate
# @param sort [true, false] whether to sort the resulting array;
# meaningless if `scalar` is true
#
# @return [String, Array<String>, nil] the CURIE(s) in question
#
def abbreviate term, scalar: true, noop: true, sort: true,
    prefixes: nil, vocab: nil, cache: nil
  # NOTE: `cache` is accepted for interface compatibility but is not
  # used by this method
  term = coerce_resources term

  # always work on a copy, whether the map is ours or the caller's,
  # since the nil (vocab) key may get overwritten below
  prefixes = (prefixes || self.prefixes).dup

  if vocab
    prefixes[nil] = coerce_resource vocab, as: :term
  elsif prefixes.key? nil
    prefixes[nil] = coerce_resource prefixes[nil], as: :term
  end

  # only do this if there's something to do, cause it's expensive
  # XXX also figure out a sensible way to cache this move
  prefixes = sanitize_prefixes prefixes unless
    prefixes.empty? or (prefixes.size == 1 and prefixes.key? nil)

  # okay now merge
  prefixes = @prefixes.merge prefixes

  # invert to namespace => prefix; since hash key order is preserved
  # this will clobber any explicit namespace prefix for the vocab.
  # the pairs are sorted longest namespace first, once, so the most
  # specific namespace is tried before any it is an extension of
  ordered = prefixes.invert.sort do |a, b|
    b.first.to_uri.to_s.length <=> a.first.to_uri.to_s.length
  end

  term.map! do |t|
    t = t.to_s
    slug = nil # we want this value to be nil if no match and !noop

    ordered.each do |ns, pfx|
      # this will start us off with the terminating slug
      slug = t.delete_prefix ns.to_s
      # this is saying the URI either doesn't match or abbreviates to ""
      if slug == t or pfx.nil? && slug.empty?
        slug = nil
      else
        # it's already a slug so we add a prefix if there is one
        slug = '%s:%s' % [pfx, slug] unless pfx.nil?
        break # we have our match
      end
    end

    # at this point slug is either an abbreviated term or nil, so:
    slug ||= t if noop
    slug
  end

  # only sort if noop is set (otherwise there may be nils in there)
  term.sort! if noop && sort

  scalar ? term.first : term
end
# Return the subset of prefixes, in the form of a `{ foo: "bar" }`
# {Hash}, that cover, to the extent of available prefix mappings,
# the set of terms passed in. The terms can be embedded in any
# kind of data structure that can be flattened into an {Array}.
# Elements not belonging to the {URI} (which are coerced),
# {RDF::URI}, and {RDF::Literal} (from which datatypes are
# harvested) classes are ignored. The `nil` key can be interpreted
# as the `vocab` for the given scope.
#
# @param terms [Array<RDF::URI, RDF::Literal, URI>] an array (or
# something that can ultimately be turned _into_ an array) of
# terms
#
# @return [Hash{Symbol, nil => RDF::Vocabulary::Term}] the prefix subset
#
def prefix_subset terms
  # sniff out all the URIs and datatypes
  uris = smush_struct terms, uris: true

  # abbreviate what can be abbreviated; the rest comes back as nil
  curies = abbreviate(uris.to_a, noop: false, sort: false, scalar: false)

  # keep just the prefix of each CURIE (nil when there is none, which
  # stands for the vocab)
  heads = curies.compact.map do |curie|
    head = /^(?:([^:]+):)?/.match(curie).captures.first
    head ? head.to_sym : head
  end
  wanted = heads.uniq.to_set

  # now we return the subset
  @prefixes.select { |key, _| wanted.include? key }
end
# Determine if a path ends with a slash (modulo path parameters).
#
# @param uri [#to_s] the URI path
#
# @return [false, true] whether the path ends with a slash.
#
def slash? uri
  uri = coerce_resource uri, as: :uri
  # something without a path cannot end in a slash
  return false unless uri.respond_to?(:path)
  # the final slash may be trailed by path parameters
  /\/(?:;[^\/]*)?$/.match? uri.path
end
# Clean any dodginess (`//`, `.`, `..`) out of the path, including
# path parameters. Unlike {Pathname#cleanpath} it preserves the
# trailing slash if one is present. Returns either the cleaned
# path or an array of segments. Returns nil (or empty array) if
# the URI does not respond to `#path`.
#
# @param uri [URI, RDF::URI, #to_s] the URI
# @param scalar [true, false] whether to return a string or array
# @param slash [true, false] whether to preserve a trailing slash
#
# @return [String, Array<String>, nil] the cleaned path
#
def clean_path uri, scalar: true, slash: true
  uri = coerce_resource uri, as: :uri

  # bail out if this isn't the kind of uri that has a path; that
  # includes objects that respond to #path but return nil for it
  orig = uri.path if uri.respond_to? :path
  return scalar ? nil : [] unless orig

  segments = orig.split(/\/+/).each_with_object([]) do |seg, acc|
    # lop the path parameters off the segment
    m = /^([^;]*)(?:;.*)?$/.match(seg) or next

    case name = m.captures.first
    when '', ?. then next  # empty and same-directory segments vanish
    when '..' then acc.pop # parent: back up one (a no-op at the root)
    else acc << name
    end
  end

  return segments unless scalar

  path = segments.join ?/
  # put the trailing slash back if there was one and it is wanted
  path << ?/ if slash and slash? orig
  path
end
# Test if a URI path contains a UUID in its first (and only)
# segment and return it as a UUID URN.
#
# @param uri [URI, RDF::URI, #to_s] the URI
# @param as [:rdf, :uri, false, nil] how to coerce the result