diff --git a/README.md b/README.md index af6d137..3aa4067 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ If the URL supplied is a data catalog, it gets the `dataset` array and flattens #### Returns A `Promise` that resolves with an object containing: + - `catalogMetadata`: A JSON-LD object of the root data catalog provided. - `urls` - An array of strings, each being a URL for a dataset. - `errors` - An array of error objects, each containing details about errors encountered during the retrieval process. If no errors were encountered, this array is empty. Each error object includes: - `url`: The URL from which data was being fetched when the error occurred. @@ -86,7 +87,8 @@ The `errors` array it returns will detail any issues that occurred during the pr #### Returns: A `Promise` that resolves with an object containing: - - `jsonld`: An array of extracted JSON-LD objects from the datasets. + - `catalogMetadata`: A JSON-LD object of the root data catalog provided. + - `datasets`: An array of extracted JSON-LD objects from the Dataset Sites. - `errors`: An array of error objects indicating any issues encountered during fetching. Each error object includes: - `url`: The URL from which data was being fetched when the error occurred. - `status`: HTTP status code of the error response (if available). @@ -96,8 +98,8 @@ A `Promise` that resolves with an object containing: ```js const { getAllDatasets } = require('@openactive/dataset-utils'); -getAllDatasets().then(({ jsonld, errors }) => { - console.log(jsonld); +getAllDatasets().then(({ datasets, errors }) => { + console.log(datasets); // Iterating through the errors errors.forEach(error => { diff --git a/index.js b/index.js index ce6f7c2..fa50588 100644 --- a/index.js +++ b/index.js @@ -9,7 +9,7 @@ const { Parser } = require('htmlparser2'); * If the URL is not supplied, the OA Data Catalog (https://openactive.io/data-catalogs/data-catalog-collection.jsonld) is used. 
* * @param {string} [dataCatalogUrl] - * @returns {Promise<{urls: string[], errors: object[]}>} + * @returns {Promise<{catalogMetadata: object, urls: string[], errors: object[]}>} */ async function getAllDatasetSiteUrls(dataCatalogUrl = 'https://openactive.io/data-catalogs/data-catalog-collection.jsonld') { let catalog; @@ -35,16 +35,28 @@ async function getAllDatasetSiteUrls(dataCatalogUrl = 'https://openactive.io/dat const allUrls = [].concat(...datasetArraysAndErrors.map(data => data.urls)); const allErrors = [].concat(...datasetArraysAndErrors.map(data => data.errors)); - return { urls: allUrls, errors: allErrors }; + return { + catalogMetadata: catalog, + urls: allUrls, + errors: allErrors, + }; } // If the catalog has `dataset`, it does not have any further part catalogs and the datasets can be got from them if (catalog.dataset) { - return { urls: catalog.dataset, errors: [] }; + return { + catalogMetadata: catalog, + urls: catalog.dataset, + errors: [], + }; } // If the catalog has neither `hasPart` or `dataset`, return [] as it does not have the information we want - return { urls: [], errors }; + return { + catalogMetadata: catalog, + urls: [], + errors, + }; } /** @@ -88,12 +100,12 @@ function extractJSONLDfromHTML(url, html) { * If dataCatalogUrl is not supplied, the default OA Data Catalog (https://openactive.io/data-catalogs/data-catalog-collection.jsonld) is used. 
* * @param {string} [dataCatalogUrl] - * @returns {Promise<{jsonld: Record[], errors: string[]}>} + * @returns {Promise<{catalogMetadata: object, datasets: Record[], errors: object[]}>} * */ async function getAllDatasets(dataCatalogUrl = 'https://openactive.io/data-catalogs/data-catalog-collection.jsonld') { // Get Dataset URLs - const { urls: datasetUrls, errors } = await getAllDatasetSiteUrls(dataCatalogUrl); + const { catalogMetadata, urls: datasetUrls, errors } = await getAllDatasetSiteUrls(dataCatalogUrl); const jsonldFromDatasetUrls = (await Promise.all(datasetUrls.map(async (datasetUrl) => { let dataset; @@ -109,13 +121,34 @@ async function getAllDatasets(dataCatalogUrl = 'https://openactive.io/data-catal return null; } - const jsonld = extractJSONLDfromHTML(datasetUrl, dataset); - return jsonld; + try { + const jsonld = extractJSONLDfromHTML(datasetUrl, dataset); + if (!jsonld || !jsonld['@id']) { + errors.push({ + url: datasetUrl, + status: null, + message: 'Invalid JSON-LD found in dataset HTML - it did not contain `@id`.', + }); + return null; + } + return jsonld; + } catch (error) { + errors.push({ + url: datasetUrl, + status: null, + message: error.message, + }); + return null; + } }))) // Filter out datasets that do not have valid dataset .filter(x => !!x); - return { jsonld: jsonldFromDatasetUrls, errors }; + return { + catalogMetadata, + datasets: jsonldFromDatasetUrls, + errors, + }; } /** @@ -188,7 +221,7 @@ async function axiosGetWithRetryForKnownLegendIssue(url) { for (let attempt = 0; attempt < maxRetries; attempt += 1) { try { - response = await axios.get(url); + response = await axios.get(url, { timeout: 60000 }); break; // Exit the loop if the request was successful } catch (error) { if (error.response && error.response.status === 403 && attempt < maxRetries - 1) { diff --git a/test/getAllDatasets-test.js b/test/getAllDatasets-test.js index 3b887db..fec37ff 100644 --- a/test/getAllDatasets-test.js +++ b/test/getAllDatasets-test.js @@ -25,9 
+25,11 @@ describe('getAllDatasets()', function () { }); // Test - const { jsonld: datasets } = await getAllDatasets('https://openactive.io/data-catalogs/example-data-catalog-collection.jsonld'); + const { datasets, errors } = await getAllDatasets('https://openactive.io/data-catalogs/example-data-catalog-collection.jsonld'); // Assertions + expect(errors).to.be.an('array'); + expect(errors.length).to.equal(0); expect(datasets).to.be.an('array'); expect(datasets.length).to.be.above(0); expect(datasets[0]).to.be.an('object');