diff --git a/CHANGELOG.md b/CHANGELOG.md index 5039de4c..1d5febaa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,8 @@ - `omitGraph` based on processingMode. - Replaced `removePreserve` with `cleanupPreserve` and `cleanupNulls`. - Remove unused framing `graphStack` code that was removed from the spec. +- Update calls to `documentLoader` to pass options. +- Pass `requestProfile` in `Accept` header when loading documents. ### Added - Support for `"@import"`. @@ -74,6 +76,7 @@ - Top level `@graph` omitted if `omitGraph` is `true`. - Check for invalid values of `@embed`. - Support default values for `@type` when framing. +- Support for extracting JSON-LD from HTML, when the xmldom package is loaded. ## 2.0.2 - 2020-01-17 diff --git a/lib/ContextResolver.js b/lib/ContextResolver.js index e70ba98a..83d97ccf 100644 --- a/lib/ContextResolver.js +++ b/lib/ContextResolver.js @@ -163,7 +163,7 @@ module.exports = class ContextResolver { let remoteDoc; try { - remoteDoc = await documentLoader(url); + remoteDoc = await documentLoader(url, {}); context = remoteDoc.document || null; // parse string context as JSON if(_isString(context)) { diff --git a/lib/documentLoaders/node.js b/lib/documentLoaders/node.js index 88439b3b..f6022b6d 100644 --- a/lib/documentLoaders/node.js +++ b/lib/documentLoaders/node.js @@ -3,7 +3,12 @@ */ 'use strict'; -const {parseLinkHeader, buildHeaders} = require('../util'); +const contentType = require('content-type'); + +const { + parseLinkHeader, + buildHeaders +} = require('../util'); const {LINK_HEADER_CONTEXT} = require('../constants'); const JsonLdError = require('../JsonLdError'); const RequestQueue = require('../RequestQueue'); @@ -38,11 +43,11 @@ module.exports = ({ const http = require('http'); const queue = new RequestQueue(); - return queue.wrapLoader(function(url) { - return loadDocument(url, []); + return queue.wrapLoader(function(url, options) { + return loadDocument(url, options, []); }); - async function loadDocument(url, 
redirects) { + async function loadDocument(url, options, redirects) { if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) { throw new JsonLdError( 'URL could not be dereferenced; only "http" and "https" URLs are ' + @@ -61,6 +66,12 @@ module.exports = ({ return doc; } + // add any optional requestProfile + if(options.requestProfile) { + headers.Accept = + headers.Accept + `, application/ld+json;profile=${options.requestProfile}`; + } + let result; let alternate = null; try { @@ -78,8 +89,17 @@ module.exports = ({ } const {res, body} = result; + const {type, parameters} = contentType.parse(res); + + doc = { + contextUrl: null, + documentUrl: url, + document: body || null, + contentType: type, + profile: parameters.profile + }; - doc = {contextUrl: null, documentUrl: url, document: body || null}; + // separate profile from content-type // handle error const statusText = http.STATUS_CODES[res.statusCode]; @@ -95,7 +115,7 @@ module.exports = ({ // handle Link Header if(res.headers.link && - res.headers['content-type'] !== 'application/ld+json') { + contentType !== 'application/ld+json') { // only 1 related link header permitted const linkHeaders = parseLinkHeader(res.headers.link); const linkedContext = linkHeaders[LINK_HEADER_CONTEXT]; @@ -144,7 +164,7 @@ module.exports = ({ }); } redirects.push(url); - return loadDocument(res.headers.location, redirects); + return loadDocument(res.headers.location, options, redirects); } // cache for each redirected URL diff --git a/lib/documentLoaders/xhr.js b/lib/documentLoaders/xhr.js index f1c53e74..b4e6223d 100644 --- a/lib/documentLoaders/xhr.js +++ b/lib/documentLoaders/xhr.js @@ -3,7 +3,12 @@ */ 'use strict'; -const {parseLinkHeader, buildHeaders} = require('../util'); +const contentType = require('content-type'); + +const { + parseLinkHeader, + buildHeaders +} = require('../util'); const {LINK_HEADER_CONTEXT} = require('../constants'); const JsonLdError = require('../JsonLdError'); const RequestQueue = 
require('../RequestQueue'); @@ -31,7 +36,7 @@ module.exports = ({ const queue = new RequestQueue(); return queue.wrapLoader(loader); - async function loader(url) { + async function loader(url, options) { if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) { throw new JsonLdError( 'URL could not be dereferenced; only "http" and "https" URLs are ' + @@ -45,6 +50,12 @@ module.exports = ({ 'jsonld.InvalidUrl', {code: 'loading document failed', url}); } + // add any optional requestProfile + if(options.requestProfile) { + headers.Accept = + headers.Accept + `, application/ld+json;profile=${options.requestProfile}`; + } + let req; try { req = await _get(xhr, url, headers); @@ -65,13 +76,21 @@ module.exports = ({ }); } - let doc = {contextUrl: null, documentUrl: url, document: req.response}; + const {type, parameters} = contentType.parse(req); + + let doc = { + contextUrl: null, + documentUrl: url, + document: req.response, + contentType: type, + profile: parameters.profile + }; let alternate = null; // handle Link Header (avoid unsafe header warning by existence testing) - const contentType = req.getResponseHeader('Content-Type'); let linkHeader; - if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) { + if(contentType !== 'application/ld+json' && + REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) { linkHeader = req.getResponseHeader('Link'); } if(linkHeader && contentType !== 'application/ld+json') { diff --git a/lib/frame.js b/lib/frame.js index 0a10f217..27675fb7 100644 --- a/lib/frame.js +++ b/lib/frame.js @@ -231,9 +231,10 @@ api.frame = (state, subjects, frame, parent, property = null) => { // recurse into list if(graphTypes.isList(o)) { - const subframe = (frame[prop] && frame[prop][0] && frame[prop][0]['@list']) ? - frame[prop][0]['@list'] : - _createImplicitFrame(flags); + const subframe = + (frame[prop] && frame[prop][0] && frame[prop][0]['@list']) ? 
+ frame[prop][0]['@list'] : + _createImplicitFrame(flags); // add empty list const list = {'@list': []}; diff --git a/lib/jsonld.js b/lib/jsonld.js index ebd91bf5..1d1a6d32 100644 --- a/lib/jsonld.js +++ b/lib/jsonld.js @@ -34,6 +34,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ const canonize = require('rdf-canonize'); +const contentType = require('content-type'); const util = require('./util'); const ContextResolver = require('./ContextResolver'); const IdentifierIssuer = util.IdentifierIssuer; @@ -42,6 +43,7 @@ const LRU = require('lru-cache'); const NQuads = require('./NQuads'); const Rdfa = require('./Rdfa'); +const {prependBase: _prependBase} = require('./url'); const {expand: _expand} = require('./expand'); const {flatten: _flatten} = require('./flatten'); const {fromRDF: _fromRDF} = require('./fromRdf'); @@ -378,6 +380,7 @@ jsonld.flatten = async function(input, ctx, options) { // set default options options = _setDefaults(options, { base: _isString(input) ? input : '', + extractAllScripts: true, contextResolver: new ContextResolver( {sharedCache: _resolvedContextCache}) }); @@ -663,6 +666,7 @@ jsonld.toRDF = async function(input, options) { // set default options options = _setDefaults(options, { base: _isString(input) ? input : '', + extractAllScripts: true, skipExpansion: false, contextResolver: new ContextResolver( {sharedCache: _resolvedContextCache}) @@ -862,6 +866,9 @@ jsonld.documentLoader = async url => { * @param url the URL to fetch. * @param [options] the options to use: * [documentLoader] the document loader to use. + * [extractAllScripts] concatenates all matching script elements. + * [profile] used when selecting from HTML script elements. + * [requestProfile] one or more profile IRIs to use in the request. * * @return a Promise that resolves to the retrieved remote document. 
*/ @@ -873,7 +880,10 @@ jsonld.get = async function(url, options) { load = jsonld.documentLoader; } - const remoteDoc = await load(url); + // FIXME: unescape frag? + const [reference, frag] = url.split('#', 2); + + const remoteDoc = await load(reference, options); try { if(!remoteDoc.document) { @@ -882,9 +892,74 @@ jsonld.get = async function(url, options) { 'jsonld.NullRemoteDocument'); } if(_isString(remoteDoc.document)) { - remoteDoc.document = JSON.parse(remoteDoc.document); + if(remoteDoc.contentType && remoteDoc.contentType.includes('text/html')) { + const domParser = new jsonld.domParser(); + const dom = domParser.parseFromString(remoteDoc.document); + + // Use any document base + const baseElem = dom.getElementsByTagName('base'); + if(baseElem.length > 0) { + const href = baseElem[0].getAttribute('href'); + options.base = _prependBase(options.base || reference, href); + } + + const scripts = dom.getElementsByTagName('script'); + remoteDoc.document = []; + + for(let i = 0; i < scripts.length; i++) { + const script = scripts[i]; + // only application/ld+json + const {type} = contentType.parse(script.getAttribute('type')); + if(type !== 'application/ld+json') { + continue; + } + if(!script.getAttribute('type').startsWith('application/ld+json')) { + continue; + } + // If url has a fragment identifier, only matching scripts + if(frag && script.getAttribute('id') !== frag) { + continue; + } + try { + remoteDoc.document.push(JSON.parse(script.textContent)); + } catch(e) { + throw new JsonLdError( + 'Illegal script content.', + 'jsonld.InvalidScriptElement', { + code: 'invalid script element', + remoteDoc + }); + } + } + if(frag && remoteDoc.document.length === 0) { + throw new JsonLdError( + `No script tag found with id=${frag}.`, + 'jsonld.InvalidScriptElement', { + code: 'loading document failed', + remoteDoc + }); + } + if(frag || !options.extractAllScripts) { + if(!remoteDoc.document[0]) { + throw new JsonLdError( + `No script tag found.`, + 
'jsonld.InvalidScriptElement', { + code: 'loading document failed', + remoteDoc + }); + } + remoteDoc.document = remoteDoc.document[0]; + } + } else { + remoteDoc.document = JSON.parse(remoteDoc.document); + } } } catch(e) { + if(e.name === 'jsonld.InvalidScriptElement') { + // pass error detected in HTML decode + throw (e); + } + // otherwise, general loading error throw new JsonLdError( 'Could not retrieve a JSON-LD document from the URL.', 'jsonld.LoadDocumentError', { @@ -942,6 +1017,27 @@ jsonld.documentLoaders = {}; jsonld.documentLoaders.node = require('./documentLoaders/node'); jsonld.documentLoaders.xhr = require('./documentLoaders/xhr'); +// Optional DOM parser +try { + jsonld.domParser = require('xmldom').DOMParser || class NoDOMParser { + parseFromString() { + throw new JsonLdError( + 'Could not parse HTML document. ' + + 'HTML parsing not implemented.', 'jsonld.LoadDocumentError', + {code: 'loading document failed'}); + } + }; +} catch(e) { + jsonld.domParser = class NoDOMParser { + parseFromString() { + throw new JsonLdError( + 'Could not parse HTML document. ' + + 'HTML parsing not implemented.', 'jsonld.LoadDocumentError', + {code: 'loading document failed'}); + } + }; +} + /** * Assigns the default document loader for external document URLs to a built-in * default. Supported types currently include: 'xhr' and 'node'. 
diff --git a/lib/util.js b/lib/util.js index 77da8f61..c07f7669 100644 --- a/lib/util.js +++ b/lib/util.js @@ -15,6 +15,7 @@ const REGEX_LINK_HEADER = /\s*<([^>]*?)>\s*(?:;\s*(.*))?/; const REGEX_LINK_HEADER_PARAMS = /(.*?)=(?:(?:"([^"]*?)")|([^"]*?))\s*(?:(?:;\s*)|$)/g; +// FIXME: conditionally support text/html const DEFAULTS = { headers: { accept: 'application/ld+json, application/json' diff --git a/package.json b/package.json index adb65fab..0505df57 100644 --- a/package.json +++ b/package.json @@ -31,6 +31,7 @@ ], "dependencies": { "canonicalize": "^1.0.1", + "content-type": "^1.0.4", "lru-cache": "^5.1.1", "object.fromentries": "^2.0.2", "rdf-canonize": "^1.0.2", diff --git a/tests/test-common.js b/tests/test-common.js index b51c7555..1296f2ca 100644 --- a/tests/test-common.js +++ b/tests/test-common.js @@ -25,6 +25,15 @@ const manifest = options.manifest || { filename: '/' }; +let htmlSupport; +try { + // xmldom may load but not have a DOMParser + htmlSupport = !!require('xmldom').DOMParser; +} catch(e) { + htmlSupport = false; +} +console.log('HTML Support: ' + htmlSupport); + const TEST_TYPES = { 'jld:CompactTest': { skip: { @@ -35,11 +44,6 @@ const TEST_TYPES = { // NOTE: idRegex format: //MMM-manifest#tNNN$/, idRegex: [ - // html - /html-manifest#tc001$/, - /html-manifest#tc002$/, - /html-manifest#tc003$/, - /html-manifest#tc004$/, ] }, fn: 'compact', @@ -63,33 +67,8 @@ const TEST_TYPES = { /expand-manifest#t0129$/, // html - /html-manifest#te001$/, - /html-manifest#te002$/, - /html-manifest#te003$/, - /html-manifest#te004$/, - /html-manifest#te005$/, - /html-manifest#te006$/, - /html-manifest#te007$/, - /html-manifest#te010$/, - /html-manifest#te011$/, - /html-manifest#te012$/, - /html-manifest#te013$/, - /html-manifest#te014$/, - /html-manifest#te015$/, - /html-manifest#te016$/, - /html-manifest#te017$/, - /html-manifest#te018$/, - /html-manifest#te019$/, - /html-manifest#te020$/, - /html-manifest#te021$/, - /html-manifest#te022$/, -
/html-manifest#tex01$/, - // HTML extraction - /expand-manifest#thc01$/, - /expand-manifest#thc02$/, - /expand-manifest#thc03$/, - /expand-manifest#thc04$/, - /expand-manifest#thc05$/, + /html-manifest#tex01$/, // XHTML + /html-manifest#te010$/, // unescaped content // remote /remote-doc-manifest#t0013$/, // HTML ] @@ -110,11 +89,6 @@ const TEST_TYPES = { // NOTE: idRegex format: //MMM-manifest#tNNN$/, idRegex: [ - // html - /html-manifest#tf001$/, - /html-manifest#tf002$/, - /html-manifest#tf003$/, - /html-manifest#tf004$/, ] }, fn: 'flatten', @@ -189,26 +163,7 @@ const TEST_TYPES = { /toRdf-manifest#twf05$/, // html - /html-manifest#tr001$/, - /html-manifest#tr002$/, - /html-manifest#tr003$/, - /html-manifest#tr004$/, - /html-manifest#tr005$/, - /html-manifest#tr006$/, - /html-manifest#tr007$/, - /html-manifest#tr010$/, - /html-manifest#tr011$/, - /html-manifest#tr012$/, - /html-manifest#tr013$/, - /html-manifest#tr014$/, - /html-manifest#tr015$/, - /html-manifest#tr016$/, - /html-manifest#tr017$/, - /html-manifest#tr018$/, - /html-manifest#tr019$/, - /html-manifest#tr020$/, - /html-manifest#tr021$/, - /html-manifest#tr022$/, + /html-manifest#tr010$/, // unescaped content // Invalid Statement /toRdf-manifest#te075$/, /toRdf-manifest#te111$/, @@ -439,6 +394,13 @@ function addTest(manifest, test, tests) { self.skip(); } + // if xmldom not loaded, skip HTML tests + if(isJsonLdType(test, 'jld:HtmlTest') && !htmlSupport) { + console.log('Skipping test due to lack of HTML support:', + {id: test['@id'], name: test.name}); + self.skip(); + } + // skip based on test type if(isJsonLdType(test, SKIP_TESTS)) { if(options.verboseSkip) { @@ -894,11 +856,11 @@ function createDocumentLoader(test) { 'https://w3c.github.io/json-ld-api/tests', 'https://w3c.github.io/json-ld-framing/tests' ]; - const localLoader = function(url) { + const localLoader = function(url, options) { // always load remote-doc tests remotely in node // NOTE: disabled due to github pages issues. 
//if(options.nodejs && test.manifest.name === 'Remote document') { - // return jsonld.documentLoader(url); + // return jsonld.documentLoader(url, options); //} // FIXME: this check only works for main test suite and will not work if: @@ -915,25 +877,34 @@ function createDocumentLoader(test) { } // load remotely - return jsonld.documentLoader(url); + return jsonld.documentLoader(url, options); }; return localLoader; function loadLocally(url) { - const doc = {contextUrl: null, documentUrl: url, document: null}; - const options = test.option; + const doc = { + contextUrl: null, + documentUrl: url, + document: null, + contentType: null, + profile: null + }; + const options = test.option || {}; + doc.contentType = options.contentType; + if(!doc.contentType && url.indexOf('.jsonld', url.length - 7) !== -1) { + doc.contentType = 'application/ld+json'; + } + if(!doc.contentType && url.indexOf('.json', url.length - 5) !== -1) { + doc.contentType = 'application/json'; + } + if(!doc.contentType && url.indexOf('.html', url.length - 5) !== -1) { + doc.contentType = 'text/html'; + } if(options && url === test.base) { if('redirectTo' in options && parseInt(options.httpStatus, 10) >= 300) { doc.documentUrl = test.manifest.baseIri + options.redirectTo; } else if('httpLink' in options) { - let contentType = options.contentType || null; - if(!contentType && url.indexOf('.jsonld', url.length - 7) !== -1) { - contentType = 'application/ld+json'; - } - if(!contentType && url.indexOf('.json', url.length - 5) !== -1) { - contentType = 'application/json'; - } let linkHeader = options.httpLink; if(Array.isArray(linkHeader)) { linkHeader = linkHeader.join(','); @@ -941,7 +912,7 @@ function createDocumentLoader(test) { const linkHeaders = jsonld.parseLinkHeader(linkHeader); const linkedContext = linkHeaders['http://www.w3.org/ns/json-ld#context']; - if(linkedContext && contentType !== 'application/ld+json') { + if(linkedContext && doc.contentType !== 'application/ld+json') { 
if(Array.isArray(linkedContext)) { throw {name: 'multiple context link headers'}; } @@ -951,7 +922,8 @@ function createDocumentLoader(test) { // If not JSON-LD, alternate may point there if(linkHeaders['alternate'] && linkHeaders['alternate'].type == 'application/ld+json' && - !(contentType || '').match(/^application\/(\w*\+)?json$/)) { + !(doc.contentType || '').match(/^application\/(\w*\+)?json$/)) { + doc.contentType = 'application/ld+json'; doc.documentUrl = prependBase(url, linkHeaders['alternate'].target); } } @@ -975,12 +947,22 @@ function createDocumentLoader(test) { }); } - return p.then(readJson).then(json => { - doc.document = json; - return doc; - }).catch(() => { - throw {name: 'loading document failed', url}; - }); + // parse JSON, if appropriate + if(!doc.contentType || doc.contentType.includes('json')) { + return p.then(readJson).then(json => { + doc.document = json; + return doc; + }).catch(() => { + throw {name: 'loading document failed', url}; + }); + } else { + return p.then(readFile).then(content => { + doc.document = content; + return doc; + }).catch(() => { + throw {name: 'loading document failed', url}; + }); + } } }