From 6509227d370aadfa4fa1970e14da1abe5f182b69 Mon Sep 17 00:00:00 2001 From: Sam Cox Date: Tue, 23 Apr 2024 12:19:02 +0100 Subject: [PATCH 1/2] Add example of the process for updating a schema to the README. --- README.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b94eb5e..3cf9bfc 100644 --- a/README.md +++ b/README.md @@ -1 +1,64 @@ -New collection of schemas for Mk2 \ No newline at end of file +New collection of schemas for Mk2 + + +## Modifying a schema + +The below is a walkthrough of the steps required to make a change to an existing schema, by way of +an example. Other changes will require slightly different steps. + +In this example, we wish to modify the type of a field (`accessibility.formatAndStandards.conformsTo`) +in the HDRUK 2.2.1 schema. + +### Modify schema files + +The schema definition in `hdr_schemata/models/HDRUK/2.2.1/schema.json` is +generated from the contents of `hdr_schemata/models/HDRUK/v2_2_1`, building +upon the contents of previous versions. `v2_1_2` is the "base model" all +current schemas are derived from. + +We create a new `FormatAndStandards.py` in the +`v2_2_1` directory, with the boilerplate taken from `v2_1_2/FormatAndStandards.py`. Into this file we import +```python +from hdr_schemata.models.HDRUK.v2_2_0 import ( + FormatAndStandards as BaseFormatAndStandards, +) +``` +Note that this is from the previous version, which itself imports from `v2_1_2`. Ensure that the `v2_1_2/__init__.py` correctly exports the class. 
+ +Overload the class, using the imported class as base: +```python +class FormatAndStandards(BaseFormatAndStandards): + conformsTo: Optional[List[StandardisedDataModels]] = Field( + ..., **an.vocabularyEncodingScheme.__dict__ + ) +``` + +Now import this file into `v2_2_1/Accessibility.py`, and overload the field there with the new type: + +```python +class Accessibility(BaseAccessibility): + access: Access = Field(..., description=an.description, title=an.title) + + formatAndStandards: Optional[FormatAndStandards] = Field( + None, + title=an.formatAndStandards.title, + description=an.formatAndStandards.description, + ) +``` + +The changes to the schema are now ready to process. + +### Create schema json + +Run `python create_json_schema.py`. This will modify the contents of `hdr_schemata/models/HDRUK/2.2.1/schema.json` as appropriate. You may need to install the repo as a local package: +```bash +pip install -e . +``` + +### Update docs + +Finally, update the Markdown docs with: +```bash +python hdr_schemata/utils/create_markdown.py +``` +to generate the Markdown in `docs/`. After merging, this will be available at `https://hdruk.github.io/schemata-2/`. 
\ No newline at end of file From d2d444cd4370c028e330db142d1799accea5e1e0 Mon Sep 17 00:00:00 2001 From: Branwen Snelling Date: Thu, 2 May 2024 17:30:48 +0100 Subject: [PATCH 2/2] update json flattening and run for hdruk 2.2.1 --- docs/HDRUK/2.2.1.form.json | 1355 +++++++++++++++++++++++++ docs/HDRUK/2.2.1.md | 114 +-- docs/HDRUK/2.2.1.structure.json | 38 +- hdr_schemata/utils/create_markdown.py | 63 +- 4 files changed, 1481 insertions(+), 89 deletions(-) create mode 100644 docs/HDRUK/2.2.1.form.json diff --git a/docs/HDRUK/2.2.1.form.json b/docs/HDRUK/2.2.1.form.json new file mode 100644 index 0000000..54dae6f --- /dev/null +++ b/docs/HDRUK/2.2.1.form.json @@ -0,0 +1,1355 @@ +{ + "schema_fields": [ + { + "required": true, + "title": "Dataset identifier", + "description": "System dataset identifier", + "examples": [ + "226fb3f1-4471-400a-8c39-2b66d46a39b6", + "https://web.www.healthdatagateway.org/dataset/226fb3f1-4471-400a-8c39-2b66d46a39b6" + ], + "is_list": false, + "is_optional": true, + "types": { + "maxLength": 36, + "minLength": 36, + "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$", + "title": "Uuidv4", + "type": "string" + }, + "location": "identifier" + }, + { + "required": true, + "title": "Dataset Version", + "description": "Dataset metadata version", + "examples": [ + "1.1.0" + ], + "is_list": false, + "is_optional": false, + "types": { + "pattern": "^([0-9]+)\\.([0-9]+)\\.([0-9]+)$", + "title": "Semver", + "type": "string" + }, + "location": "version" + }, + { + "required": true, + "title": "revision version", + "description": "Version number used for previous version of this dataset", + "examples": [ + "6.0.0" + ], + "is_list": false, + "is_optional": false, + "types": { + "pattern": "^([0-9]+)\\.([0-9]+)\\.([0-9]+)$", + "title": "Semver", + "type": "string" + }, + "location": "revisions.version" + }, + { + "required": true, + "title": "revision url", + "description": "Some url with a reference to the record 
of a previous version of this dataset", + "examples": [ + "https://api.service.nhs.uk/health-research-data-catalogue/datasetrevisions/841f7da2-b018-41f6-b4ae-2e0aadab6561" + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "Url", + "format": "uri", + "minLength": 1, + "type": "string" + }, + "location": "revisions.url" + }, + { + "required": true, + "title": "Metadata Version Revisions", + "description": "A list of persistent identifiers and version numbers for previous versions of metadata for this dataset", + "examples": null, + "is_list": true, + "is_optional": false, + "types": "Revision", + "location": "revisions" + }, + { + "required": true, + "title": "Metadata Issued Datetime',", + "description": "Datetime stamp of when this metadata version was initially issued", + "examples": null, + "is_list": false, + "is_optional": false, + "types": "datetime", + "location": "issued" + }, + { + "required": true, + "title": "Last Modified Datetime", + "description": "Datetime stamp of when this metadata was last modified", + "examples": null, + "is_list": false, + "is_optional": false, + "types": "datetime", + "location": "modified" + }, + { + "required": true, + "title": "Title", + "description": "Title of the dataset limited to 150 characters. It should provide a short description of the dataset and be unique across the gateway. If your title is not unique, please add a prefix with your organisation name or identifier to differentiate it from other datasets within the Gateway. Please avoid acronyms wherever possible. 
Good titles should summarise the content of the dataset and if relevant, the region the dataset covers.", + "examples": [ + "North West London COVID-19 Patient Level Situation Report" + ], + "is_list": false, + "is_optional": false, + "types": { + "maxLength": 150, + "minLength": 2, + "title": "OneHundredFiftyCharacters", + "type": "string" + }, + "location": "summary.title" + }, + { + "required": true, + "title": "Dataset Abstract", + "description": "Provide a clear and brief descriptive signpost for researchers who are searching for data that may be relevant to their research. The abstract should allow the reader to determine the scope of the data collection and accurately summarise its content. The optimal length is one paragraph (limited to 255 characters) and effective abstracts should avoid long sentences and abbreviations where possible", + "examples": [ + "CPRD Aurum contains primary care data contributed by General Practitioner (GP) practices using EMIS Web\u00ae including patient registration information and all care events that GPs have chosen to record as part of their usual medical practice." + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "AbstractText", + "maxLength": 500, + "minLength": 5, + "type": "string" + }, + "location": "summary.abstract" + }, + { + "required": false, + "title": "Organisation Identifier", + "description": "Please provide a Grid.ac identifier (see https://www.grid.ac/institutes) for your organisation. 
If your organisation does not have a Grid.ac identifier please use the \u201csuggest an institute\u201d function here: https://www.grid.ac/institutes#", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "Url", + "format": "uri", + "minLength": 1, + "type": "string" + }, + "location": "summary.publisher.identifier" + }, + { + "required": true, + "title": "Organisation Name", + "description": "Name of the organisation", + "examples": null, + "is_list": false, + "is_optional": false, + "types": { + "maxLength": 150, + "minLength": 2, + "title": "OneHundredFiftyCharacters", + "type": "string" + }, + "location": "summary.publisher.name" + }, + { + "required": false, + "title": "Organisation Logo", + "description": "Please provide a logo associated with the Gateway Organisation using a valid URL. The following formats will be accepted .jpg, .png or .svg.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "Url", + "format": "uri", + "minLength": 1, + "type": "string" + }, + "location": "summary.publisher.logo" + }, + { + "required": false, + "title": "Organisation Description", + "description": "Please provide a URL that describes the organisation.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "Description", + "maxLength": 10000, + "minLength": 2, + "type": "string" + }, + "location": "summary.publisher.description" + }, + { + "required": true, + "title": "Organisation Contact Point", + "description": "Organisation contact point(s)", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "EmailAddress", + "format": "email", + "type": "string" + }, + "location": "summary.publisher.contactPoint" + }, + { + "required": false, + "title": "Organisation Membership", + "description": "Please indicate if the organisation is an Alliance Member or a Hub.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + 
"type": "string", + "options": [ + "HUB", + "ALLIANCE", + "OTHER", + "NCS" + ] + }, + "location": "summary.publisher.memberOf" + }, + { + "required": true, + "title": "Dataset publisher", + "description": "This is the organisation responsible for running or supporting the data access request process, as well as publishing and maintaining the metadata. In most this will be the same as the HDR UK Organisation (Hub or Alliance Member). However, in some cases this will be different i.e. Tissue Directory are an HDR UK Gateway organisation but coordinate activities across a number of data publishers i.e. Cambridge Blood and Stem Cell Biobank.", + "examples": null, + "is_list": false, + "is_optional": false, + "types": "Organisation", + "location": "summary.publisher" + }, + { + "required": true, + "title": "Contact Point", + "description": "Please provide a valid email address that can be used to coordinate data access requests with the publisher. Organisations are expected to provide a dedicated email address associated with the data access request process. Notes- An employee's email address can only be provided on a temporary basis and if one is provided an explicit consent must be obtained for this purpose. Gateway Feature: If no contact point is provided in this field, this field will be defaulted to the teams support email provided in the teams setting.", + "examples": [ + "SAILDatabank@swansea.ac.uk" + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "EmailAddress", + "format": "email", + "type": "string" + }, + "location": "summary.contactPoint" + }, + { + "required": true, + "title": "Keywords", + "description": "Please provide a list of relevant and specific keywords that can improve the SEO of your dataset as a comma separated list. Notes: Onboarding portal will suggest keywords based on title, abstract and description. 
We are compiling a standardised list of keywords and synonyms across datasets to make filtering easier for users.", + "examples": [ + "Preprints,Papers,HDR UK" + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "summary.keywords" + }, + { + "required": false, + "title": "Alternate dataset identifiers", + "description": "Alternate dataset identifiers or local identifiers", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "summary.alternateIdentifiers" + }, + { + "required": false, + "title": "DOI Name", + "description": "DOI associated to this dataset", + "examples": [ + "10.1093/ije/dyx196" + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "Doi", + "pattern": "^10.\\d{4,9}/[-._;()/:a-zA-Z0-9]+$", + "type": "string" + }, + "location": "summary.doiName" + }, + { + "required": true, + "title": "Dataset Type", + "description": "Placeholder for dataset type\"", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "DatasetType", + "maxLength": 100, + "minLength": 2, + "type": "string" + }, + "location": "summary.datasetType" + }, + { + "required": true, + "title": "Dataset Sub-type", + "description": "Placeholder for dataset sub-type", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "DatasetType", + "maxLength": 100, + "minLength": 2, + "type": "string" + }, + "location": "summary.datasetSubType" + }, + { + "required": true, + "title": "Population size", + "description": "Summary population size of the cohort", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "int", + "location": "summary.populationSize" + }, + { + "required": false, + "title": "Description", + "description": "A free-text description of the dataset. 
Gateway Feature: Keywords and text may be extracted out of the description and index for search", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "Description", + "maxLength": 10000, + "minLength": 2, + "type": "string" + }, + "location": "documentation.description" + }, + { + "required": false, + "title": "Associated Media", + "description": "Please provide any media associated with the Gateway Organisation using a valid URI for the content. This is an opportunity to provide additional context that could be useful for researchers wanting to understand more about the dataset and its relevance to their research question. The following formats will be accepted .jpg, .png or .svg, .pdf, .xslx or .docx. Note: media asset can be hosted by the organisation or uploaded using the onboarding portal.", + "examples": [ + "PDF Document that describes study protocol" + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "documentation.associatedMedia" + }, + { + "required": false, + "title": "Group", + "description": "Please complete only if the dataset is part of a group or family", + "examples": [ + "Hospital Episodes Statistics datasets (A&E, APC, OP, AC MSDS)." + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "documentation.isPartOf" + }, + { + "required": false, + "title": "Geographic Coverage", + "description": "The geographical area covered by the dataset. 
It is recommended that links are to entries in a well-maintained gazetteer such as https://www.geonames.org/ or https://what3words.com/daring.lion.race.", + "examples": [ + "https://www.geonames.org/2635167/united-kingdom-of-great-britain-and-northern-ireland.html" + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "coverage.spatial" + }, + { + "required": false, + "title": "Age Range", + "description": "Please indicate the age range in whole years of participants in the dataset. Please provide range in the following format '[min age] \u2013 [max age]' where both the minimum and maximum are whole numbers (integers).", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "AgeRange", + "pattern": "Not Known|(150|1[0-4][0-9]|[0-9]|[1-8][0-9]|9[0-9])-(150|1[0-4][0-9]|[0-9]|[1-8][0-9]|9[0-9])", + "type": "string" + }, + "location": "coverage.typicalAgeRange" + }, + { + "required": false, + "title": "Followup", + "description": "If known, what is the typical time span that a patient appears in the dataset (follow up period)", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "0 - 6 MONTHS", + "6 - 12 MONTHS", + "1 - 10 YEARS", + "> 10 YEARS", + "UNKNOWN", + "CONTINUOUS", + "OTHER", + null + ] + }, + "location": "coverage.followup" + }, + { + "required": false, + "title": "Pathway", + "description": "Please indicate if the dataset is representative of the patient pathway and any limitations the dataset may have with respect to pathway coverage. This could include if the dataset is from a single speciality or area, a single tier of care, linked across two tiers (e.g. 
primary and secondary care), or an integrated care record covering the whole patient pathway.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "Description", + "maxLength": 10000, + "minLength": 2, + "type": "string" + }, + "location": "coverage.pathway" + }, + { + "required": false, + "title": "Gender", + "description": "Male, Female, Other", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Male", + "Female", + "Other" + ] + }, + "location": "coverage.gender" + }, + { + "required": false, + "title": "Biological Samples", + "description": "Blood, Saliva, Urine, Other", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Blood", + "Other", + "Urine", + "Saliva" + ] + }, + "location": "coverage.biologicalsamples" + }, + { + "required": false, + "title": "Psychological", + "description": "Mental health, Cognitive function", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Cognitive Function", + "Mental Health" + ] + }, + "location": "coverage.psychological" + }, + { + "required": false, + "title": "Physical", + "description": "Cardiovascular, Respiratory, Musculoskeletal, Hearing and Vision, Reproductive", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Respiratory", + "Vision", + "Hearing", + "Musculoskeletal", + "Cardiovascular", + "Reproductive" + ] + }, + "location": "coverage.physical" + }, + { + "required": false, + "title": "Anthropometric", + "description": "Height, Weight, Waist circumference, Hip circumference, Blood pressure", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Blood Pressure", + "Hip Circumference", + "Height", + "Waist Circumference", + "Weight" + ] + }, + "location": 
"coverage.anthropometric" + }, + { + "required": false, + "title": "Lifestyle", + "description": "Cohort lifestyle habits: Smoking, Physical activity, Dietary habits, Alcohol", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Smoking", + "Dietary Habits", + "Physical Activity", + "Alcohol" + ] + }, + "location": "coverage.lifestyle" + }, + { + "required": false, + "title": "Socio-economic", + "description": "Occupation, Family circumstances, Housing, Education, Ethnic group, Martial status, Social support", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Finances", + "Family Circumstances", + "Housing", + "Education", + "Marital Status", + "Occupation", + "Ethnic Group", + "Social Support" + ] + }, + "location": "coverage.socioeconomic" + }, + { + "required": false, + "title": "Purpose", + "description": "Please indicate the purpose(s) that the dataset was collected.", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "STUDY", + "DISEASE REGISTRY", + "TRIAL", + "CARE", + "AUDIT", + "ADMINISTRATIVE", + "FINANCIAL", + "STATUTORY", + "OTHER", + null + ] + }, + "location": "provenance.origin.purpose" + }, + { + "required": false, + "title": "Source", + "description": "Please indicate the source of the data extraction", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "EPR", + "ELECTRONIC SURVEY", + "LIMS", + "OTHER INFORMATION SYSTEM", + "PAPER BASED", + "FREETEXT NLP", + "MACHINE GENERATED", + "OTHER" + ] + }, + "location": "provenance.origin.source" + }, + { + "required": false, + "title": "Collection Situation Setting", + "description": "Please indicate the setting(s) where data was collected. 
Multiple settings may be provided", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "CLINIC", + "PRIMARY CARE", + "ACCIDENT AND EMERGENCY", + "OUTPATIENTS", + "IN-PATIENTS", + "SERVICES", + "COMMUNITY", + "HOME", + "PRIVATE", + "PHARMACY", + "SOCIAL CARE", + "LOCAL AUTHORITY", + "NATIONAL GOVERNMENT", + "OTHER" + ] + }, + "location": "provenance.origin.collectionSituation" + }, + { + "required": false, + "title": "Release Date", + "description": "Date of the latest release of the dataset. If this is a regular release i.e. quarterly, or this is a static dataset please complete this alongside Periodicity. If this is Irregular or Continuously released please leave this blank. Notes: Periodicity and release date will be used to determine when the next release is expected. E.g. if the release date is documented as 01/01/2020 and it is now 20/04/2020 and there is a quarterly release schedule, the latest release will be calculated as 01/04/2020.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "date", + "location": "provenance.temporal.distributionReleaseDate" + }, + { + "required": true, + "title": "Start Date", + "description": "The start of the time period that the dataset provides coverage for. If there are multiple cohorts in the dataset with varying start dates, please provide the earliest date and use the description or the media attribute to provide more information.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "date", + "location": "provenance.temporal.startDate" + }, + { + "required": false, + "title": "End Date", + "description": "The end of the time period that the dataset provides coverage for. If the dataset is \u201cContinuous\u201d and has no known end date, please state continuous. 
If there are multiple cohorts in the dataset with varying end dates, please provide the latest date and use the description or the media attribute to provide more information.'", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "date", + "location": "provenance.temporal.endDate" + }, + { + "required": true, + "title": "Time Lag", + "description": "Please indicate the typical time-lag between an event and the data for that event appearing in the dataset", + "examples": null, + "is_list": false, + "is_optional": false, + "types": { + "type": "string", + "options": [ + "LESS 1 WEEK", + "1-2 WEEKS", + "2-4 WEEKS", + "1-2 MONTHS", + "2-6 MONTHS", + "MORE 6 MONTHS", + "VARIABLE", + "NO TIMELAG", + "NOT APPLICABLE", + "OTHER", + null + ] + }, + "location": "provenance.temporal.timeLag" + }, + { + "required": true, + "title": "Publishing Frequency", + "description": "Please indicate the frequency of distribution release. If a dataset is distributed regularly please choose a distribution release periodicity from the constrained list and indicate the next release date. When the release date becomes historical, a new release date will be calculated based on the publishing periodicity. If a dataset has been published and will remain static please indicate that it is static and indicated when it was released. If a dataset is released on an irregular basis or \u201con-demand\u201d please indicate that it is Irregular and leave release date as null. If a dataset can be published in real-time or near-real-time please indicate that it is continuous and leave release date as null. 
Notes: see https://www.dublincore.org/specifications/dublin-core/collection-description/frequency/", + "examples": null, + "is_list": false, + "is_optional": false, + "types": { + "type": "string", + "options": [ + "STATIC", + "IRREGULAR", + "CONTINUOUS", + "BIENNIAL", + "ANNUAL", + "BIANNUAL", + "QUARTERLY", + "BIMONTHLY", + "MONTHLY", + "BIWEEKLY", + "WEEKLY", + "SEMIWEEKLY", + "DAILY", + "OTHER", + null + ] + }, + "location": "provenance.temporal.publishingFrequency" + }, + { + "required": false, + "title": "Data Use Limitation", + "description": "Please provide an indication of consent permissions for datasets and/or materials, and relates to the purposes for which datasets and/or material might be removed, stored or used. NOTE: we have extended the DUO to include a value for NO LINKAGE", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "accessibility.usage.dataUseLimitation" + }, + { + "required": false, + "title": "Data Use Requirements", + "description": "Please indicate if there are any additional conditions set for use; multiple requirements may be provided. Please ensure that these restrictions are documented in access rights information.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "accessibility.usage.dataUseRequirements" + }, + { + "required": false, + "title": "Citation Requirements", + "description": "Please provide the text that you would like included as part of any citation that credits this dataset. This is typically just the name of the publisher. 
No employee details should be provided.'", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "ShortDescription", + "maxLength": 1000, + "minLength": 2, + "type": "string" + }, + "location": "accessibility.usage.resourceCreator" + }, + { + "required": false, + "title": "Investigations", + "description": null, + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "accessibility.usage.investigations" + }, + { + "required": false, + "title": "Citations", + "description": "Please provide the keystone paper associated with the dataset. Also include a list of known citations, if available and should be links to existing resources where the dataset has been used or referenced. Please provide multiple entries, or if you are using a csv upload please provide them as a tab separated list.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "Doi", + "pattern": "^10.\\d{4,9}/[-._;()/:a-zA-Z0-9]+$", + "type": "string" + }, + "location": "accessibility.usage.isReferencedBy" + }, + { + "required": true, + "title": "Access Rights", + "description": "Please provide details for the data access rights", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "LongDescription", + "maxLength": 50000, + "minLength": 2, + "type": "string" + }, + "location": "accessibility.access.accessRights" + }, + { + "required": false, + "title": "Access Service", + "description": "Please provide a brief description of the data access services that are available including: environment that is currently available to researchers;additional consultancy and services;any indication of costs associated. 
If no environment is currently available, please indicate the current plans and timelines when and how data will be made available to researchers Note: This value will be used as default access environment for all datasets submitted by the organisation. However, there will be the opportunity to overwrite this value for each dataset.", + "examples": [ + "https://cnfl.extge.co.uk/display/GERE/Research+Environment+User+Guide" + ], + "is_list": false, + "is_optional": true, + "types": { + "title": "LongDescription", + "maxLength": 50000, + "minLength": 2, + "type": "string" + }, + "location": "accessibility.access.accessService" + }, + { + "required": false, + "title": "Organisation Access Request Cost", + "description": "Please provide link(s) to a webpage detailing the commercial model for processing data access requests for the organisation (if available) Definition: Indication of commercial model or cost (in GBP) for processing each data access request by the data custodian.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "LongDescription", + "maxLength": 50000, + "minLength": 2, + "type": "string" + }, + "location": "accessibility.access.accessRequestCost" + }, + { + "required": false, + "title": "Access Request Duration", + "description": "Please provide an indication of the typical processing times based on the types of requests typically received.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "LESS 1 WEEK", + "1-2 WEEKS", + "2-4 WEEKS", + "1-2 MONTHS", + "2-6 MONTHS", + "MORE 6 MONTHS", + "VARIABLE", + "NOT APPLICABLE", + "OTHER", + null + ] + }, + "location": "accessibility.access.deliveryLeadTime" + }, + { + "required": true, + "title": "Jurisdiction", + "description": "Please use country code from ISO 3166-1 country codes and the associated ISO 3166-2 for regions, cities, states etc. 
for the country/state under whose laws the data subjects' data is collected, processed and stored.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "accessibility.access.jurisdiction" + }, + { + "required": true, + "title": "Data Controller", + "description": "Data Controller means a person/entity who (either alone or jointly or in common with other persons/entities) determines the purposes for which and the way any Data Subject data, specifically personal data, are or are to be processed.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "LongDescription", + "maxLength": 50000, + "minLength": 2, + "type": "string" + }, + "location": "accessibility.access.dataController" + }, + { + "required": false, + "title": "Data Processor", + "description": "A Data Processor, in relation to any Data Subject data, specifically personal data, means any person/entity (other than an employee of the data controller) who processes the data on behalf of the data controller.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "LongDescription", + "maxLength": 50000, + "minLength": 2, + "type": "string" + }, + "location": "accessibility.access.dataProcessor" + }, + { + "required": false, + "title": "Access/governance requirements", + "description": "Where access to data comes from: TRE/SDE, direct access, open access, varies based on project.", + "examples": [ + "TRE/SDE" + ], + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "TRE/SDE", + "Direct access", + "Open access", + "Varies based on project" + ] + }, + "location": "accessibility.access.accessServiceCategory" + }, + { + "required": true, + "title": "Controlled Vocabulary", + "description": "List any relevant terminologies / ontologies / controlled vocabularies, such as ICD 10 Codes, NHS Data 
Dictionary National Codes or SNOMED CT International, that are being used by the dataset. If the controlled vocabularies are local standards, please make that explicit. If you are using a standard that has not been included in the list, please use \u201cother\u201d and contact support desk to ask for an addition. Notes: More than one vocabulary may be provided.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "accessibility.formatAndStandards.vocabularyEncodingScheme" + }, + { + "required": true, + "title": "Controlled Vocabulary", + "description": "List any relevant terminologies / ontologies / controlled vocabularies, such as ICD 10 Codes, NHS Data Dictionary National Codes or SNOMED CT International, that are being used by the dataset. If the controlled vocabularies are local standards, please make that explicit. If you are using a standard that has not been included in the list, please use \u201cother\u201d and contact support desk to ask for an addition. Notes: More than one vocabulary may be provided.", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "title": "StandardisedDataModels", + "$ref": "#/$defs/StandardisedDataModelsEnum" + }, + "location": "accessibility.formatAndStandards.conformsTo" + }, + { + "required": true, + "title": "Controlled Vocabulary", + "description": "List any relevant terminologies / ontologies / controlled vocabularies, such as ICD 10 Codes, NHS Data Dictionary National Codes or SNOMED CT International, that are being used by the dataset. If the controlled vocabularies are local standards, please make that explicit. If you are using a standard that has not been included in the list, please use \u201cother\u201d and contact support desk to ask for an addition. 
Notes: More than one vocabulary may be provided.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "accessibility.formatAndStandards.language" + }, + { + "required": true, + "title": "Controlled Vocabulary", + "description": "List any relevant terminologies / ontologies / controlled vocabularies, such as ICD 10 Codes, NHS Data Dictionary National Codes or SNOMED CT International, that are being used by the dataset. If the controlled vocabularies are local standards, please make that explicit. If you are using a standard that has not been included in the list, please use \u201cother\u201d and contact support desk to ask for an addition. Notes: More than one vocabulary may be provided.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "accessibility.formatAndStandards.format" + }, + { + "required": false, + "title": "Format and Standards", + "description": "Section includes technical attributes for language vocabularies, sizes etc. and gives researchers facts about and processing the underlying data in the dataset.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "FormatAndStandards", + "location": "accessibility.formatAndStandards" + }, + { + "required": false, + "title": "Linked Datasets", + "description": "If applicable, please provide the DOI of other datasets that have previously been linked to this dataset and their availability. If no DOI is available, please provide the title of the datasets that can be linked, where possible using the same title of a dataset previously onboarded to the HOP. 
Note: If all the datasets from Gateway organisation can be linked please indicate \u201cALL\u201d and the onboarding portal will automate linkage across the datasets submitted.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "enrichmentAndLinkage.qualifiedRelation" + }, + { + "required": false, + "title": "Derivations", + "description": "Indicate if derived datasets or predefined extracts are available and the type of derivation available. Notes. Single or multiple dimensions can be provided as a derived extract alongside the dataset.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "enrichmentAndLinkage.derivation" + }, + { + "required": false, + "title": "Tools", + "description": "Please provide the URL of any analysis tools or models that have been created for this dataset and are available for further use. Multiple tools may be provided. 
Note: We encourage users to adopt a model along the lines of https://www.ga4gh.org/news/tool-registry-service-api-enabling-an-interoperable-library-of-genomics-analysis-tools/", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "CommaSeparatedValues", + "pattern": "([^,]+)", + "type": "string" + }, + "location": "enrichmentAndLinkage.tools" + }, + { + "required": false, + "title": "Synthetic Data Web Links", + "description": "Links to locations of information and or raw downloads of synthetic data associated with this dataset", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "title": "Url", + "format": "uri", + "minLength": 1, + "type": "string" + }, + "location": "enrichmentAndLinkage.syntheticDataWebLink" + }, + { + "required": false, + "title": "Enrichment and Linkage", + "description": "This section includes information about related datasets that may have previously been linked, as well as indicating if there is the opportunity to link to other datasets in the future. If a dataset has been enriched and/or derivations, scores and existing tools are available this section allows providers to indicate this to researchers.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "EnrichmentAndLinkage", + "location": "enrichmentAndLinkage" + }, + { + "required": true, + "title": "Statistical Population'", + "description": "Please select one of the following statistical populations for you observation", + "examples": [ + "PERSONS" + ], + "is_list": false, + "is_optional": false, + "types": { + "type": "string", + "options": [ + "PERSONS", + "EVENTS", + "FINDINGS" + ] + }, + "location": "observations.observedNode" + }, + { + "required": true, + "title": "Measured Value", + "description": "Please provide the population size associated with the population type the dataset i.e. 
1000 people in a study, or 87 images (MRI) of Knee Usage Note: Used with Statistical Population, which specifies the type of the population in the dataset.", + "examples": null, + "is_list": false, + "is_optional": false, + "types": "int", + "location": "observations.measuredValue" + }, + { + "required": false, + "title": "Disambiguating Description", + "description": "If SNOMED CT term does not provide sufficient detail, please provide a description that disambiguates the population type.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "AbstractText", + "maxLength": 500, + "minLength": 5, + "type": "string" + }, + "location": "observations.disambiguatingDescription" + }, + { + "required": true, + "title": "Observation Date", + "description": "Please provide the date that the observation was made. Some datasets may be continuously updated and the number of records will change regularly, so the observation date provides users with the date that the analysis or query was run to generate the particular observation. Multiple observations can be made i.e. an observation of cumulative COVID positive cases by specimen on the 1/1/2021 could be 2M. On the 8/1/2021 a new observation could be 2.1M. Users can add multiple observations.", + "examples": null, + "is_list": false, + "is_optional": false, + "types": "date", + "location": "observations.observationDate" + }, + { + "required": true, + "title": "Measured Property", + "description": "Initially this will be defaulted to \"COUNT\"", + "examples": null, + "is_list": false, + "is_optional": false, + "types": { + "title": "MeasuredProperty" + }, + "location": "observations.measuredProperty" + }, + { + "required": true, + "title": "Observations", + "description": "Multiple observations about the dataset may be provided and users are expected to provide at least one observation \n(1..*). 
We will be supporting the schema.org observation model (https://schema.org/Observation) with default values. Users will be encouraged to provide their own statistical populations as the project progresses. \nExample: \n <b> Statistical Population 1 \n </b> type: StatisticalPopulation populationType: Persons numConstraints: 0 \n <b> Statistical Population 2 </b> type: StatisticalPopulation populationType: Events numConstraints: 0 <b> Statistical Population 3 </b> type: StatisticalPopulation populationType: Findings numConstraints: 0 typeOf: Observation observedNode: <b> Statistical Population 1 </b> measuredProperty: count measuredValue: 32937 observationDate: \u201c2017\u201d\"\n", + "examples": null, + "is_list": true, + "is_optional": false, + "types": "Observation", + "location": "observations" + }, + { + "required": true, + "title": "Table Name", + "description": "The name of a table in a dataset.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "str", + "location": "structuralMetadata.name" + }, + { + "required": false, + "title": "Table Name", + "description": "The name of a table in a dataset.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "str", + "location": "structuralMetadata.description" + }, + { + "required": true, + "title": "Column Name", + "description": "The name of a column in a table.", + "examples": null, + "is_list": false, + "is_optional": false, + "types": { + "title": "Name" + }, + "location": "structuralMetadata.elements.name" + }, + { + "required": true, + "title": "Data Type", + "description": "The data type of values in the column", + "examples": null, + "is_list": false, + "is_optional": false, + "types": "str", + "location": "structuralMetadata.elements.dataType" + }, + { + "required": false, + "title": "Column Description", + "description": "A description of a column in a table.", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "str", + "location": 
"structuralMetadata.elements.description" + }, + { + "required": true, + "title": "Sensitive", + "description": "A True or False value, indicating if the field is sensitive or not", + "examples": null, + "is_list": false, + "is_optional": false, + "types": "bool", + "location": "structuralMetadata.elements.sensitive" + }, + { + "required": true, + "title": "Table Name", + "description": "The name of a table in a dataset.", + "examples": null, + "is_list": true, + "is_optional": false, + "types": "DataElement", + "location": "structuralMetadata.elements" + }, + { + "required": false, + "title": "Structural Metadata", + "description": "Structural metadata about tables, columns and values", + "examples": null, + "is_list": true, + "is_optional": true, + "types": "DataClass", + "location": "structuralMetadata" + }, + { + "required": false, + "title": "Data Categories", + "description": "The type of data that is associated with the samples in the study. Can be several values MIABIS-2.0-13", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Biological samples", + "Survey data", + "Imaging data", + "Medical records", + "National registries", + "Genealogical records", + "Physiological/Biochemical measurements", + "Other" + ] + }, + "location": "tissuesSampleCollection.dataCategories" + }, + { + "required": false, + "title": "Material Type", + "description": "The biospecimen saved from a biological entity for propagation e.g. testing, diagnostics, treatment or research purposes. 
Can be several values MIABIS-2.0-14", + "examples": null, + "is_list": true, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Blood", + "DNA", + "Faeces", + "Immortalized Cell Lines", + "Isolated Pathogen", + "Other", + "Plasma", + "RNA", + "Saliva", + "Serum", + "Tissue (Frozen)", + "Tissue (FFPE)", + "Urine" + ] + }, + "location": "tissuesSampleCollection.materialType" + }, + { + "required": false, + "title": "Creation Date", + "description": "Date when the tissue sample metadata was created", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "date", + "location": "tissuesSampleCollection.tissueSampleMetadata.creationDate" + }, + { + "required": false, + "title": "Anatomical Site Ontology Code", + "description": "Ontology code for the anatomical site, this code must match an ICD-0-3 format", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "title": "ICD_0_3", + "pattern": "^[C\\d]{3}\\.\\d{4}\\/\\d{1,4}$", + "type": "string" + }, + "location": "tissuesSampleCollection.tissueSampleMetadata.AnatomicalSiteOntologyCode" + }, + { + "required": false, + "title": "Tissue Sample Metadata", + "description": "Metadata related to the tissue sample", + "examples": null, + "is_list": false, + "is_optional": true, + "types": "TissueSampleMetadata", + "location": "tissuesSampleCollection.tissueSampleMetadata" + }, + { + "required": false, + "title": "Collection Type", + "description": "The type of the sample collection. 
Can be several values [MIABIS-2.0-16](https://github.com/BBMRI-ERIC/miabis/blob/master/Structured-data-and-lists.md#collection-type)", + "examples": null, + "is_list": false, + "is_optional": true, + "types": { + "type": "string", + "options": [ + "Case-control", + "Cohort", + "Cross-sectional", + "Longitudinal", + "Twin-study", + "Quality control", + "Population-based", + "Disease specific", + "Birth cohort", + "Other" + ] + }, + "location": "tissuesSampleCollection.collectionType" + }, + { + "required": false, + "title": "Tissues Sample Collection", + "description": "Metadata collection for Tissue Samples datasets", + "examples": null, + "is_list": true, + "is_optional": true, + "types": "TissuesSampleCollection", + "location": "tissuesSampleCollection" + } + ] +} \ No newline at end of file diff --git a/docs/HDRUK/2.2.1.md b/docs/HDRUK/2.2.1.md index aafd091..d0f370f 100644 --- a/docs/HDRUK/2.2.1.md +++ b/docs/HDRUK/2.2.1.md @@ -175,9 +175,9 @@ Please provide a URL that describes the organisation. 
Organisation contact point(s) -| title | is_list | is_optional | required | type | -|:---------------------------|:----------|:--------------|:-----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Organisation Contact Point | False | True | True | ["EmailAddress[{'anyOf': [{'format': 'email', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.EmailAddress.EmailAddress]]', 'null'] | +| title | is_list | is_optional | required | type | +|:---------------------------|:----------|:--------------|:-----------|:-------------------------------------------------------------------------------------------------------| +| Organisation Contact Point | False | True | True | ["EmailAddress[{'anyOf': [{'format': 'email', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -210,9 +210,9 @@ Examples: Please provide a list of relevant and specific keywords that can improve the SEO of your dataset as a comma separated list. Notes: Onboarding portal will suggest keywords based on title, abstract and description. We are compiling a standardised list of keywords and synonyms across datasets to make filtering easier for users. 
-| title | is_list | is_optional | required | type | -|:---------|:----------|:--------------|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Keywords | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[hdr_schemata.definitions.HDRUK.OneHundredFiftyCharacters.OneHundredFiftyCharacters]', 'null'] | +| title | is_list | is_optional | required | type | +|:---------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Keywords | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | Examples: @@ -223,9 +223,9 @@ Examples: Alternate dataset identifiers or local identifiers -| title | is_list | is_optional | required | type | -|:------------------------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Alternate dataset identifiers | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.ShortDescription.ShortDescription]]', 'null'] | +| title | is_list | is_optional | required | type | +|:------------------------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Alternate dataset identifiers | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 
'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -300,9 +300,9 @@ A free-text description of the dataset. Gateway Feature: Keywords and text may b Please provide any media associated with the Gateway Organisation using a valid URI for the content. This is an opportunity to provide additional context that could be useful for researchers wanting to understand more about the dataset and its relevance to their research question. The following formats will be accepted .jpg, .png or .svg, .pdf, .xslx or .docx. Note: media asset can be hosted by the organisation or uploaded using the onboarding portal. -| title | is_list | is_optional | required | type | -|:-----------------|:----------|:--------------|:-----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Associated Media | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]', 'null'] | +| title | is_list | is_optional | required | type | +|:-----------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Associated Media | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | Examples: @@ -313,9 +313,9 @@ Examples: Please complete only if the dataset is part of a group or family -| title | is_list | is_optional | required | type | 
-|:--------|:----------|:--------------|:-----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Group | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Union[hdr_schemata.definitions.HDRUK.Url.Url, NoneType, hdr_schemata.definitions.HDRUK.OneHundredFiftyCharacters.OneHundredFiftyCharacters, hdr_schemata.definitions.HDRUK.IsPartOfEnum.IsPartOfEnum]]', 'null'] | +| title | is_list | is_optional | required | type | +|:--------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Group | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | Examples: @@ -335,9 +335,9 @@ Observational, Spatial and Temporal coverage The geographical area covered by the dataset. It is recommended that links are to entries in a well-maintained gazetteer such as https://www.geonames.org/ or https://what3words.com/daring.lion.race. 
-| title | is_list | is_optional | required | type | -|:--------------------|:----------|:--------------|:-----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Geographic Coverage | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]', 'null'] | +| title | is_list | is_optional | required | type | +|:--------------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Geographic Coverage | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | Examples: @@ -591,9 +591,9 @@ This section includes information about how the data can be used and how it is c Please provide an indication of consent permissions for datasets and/or materials, and relates to the purposes for which datasets and/or material might be removed, stored or used. 
NOTE: we have extended the DUO to include a value for NO LINKAGE -| title | is_list | is_optional | required | type | -|:--------------------|:----------|:--------------|:-----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Data Use Limitation | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[hdr_schemata.definitions.HDRUK.DataUseLimitation.DataUseLimitation]', 'null'] | +| title | is_list | is_optional | required | type | +|:--------------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Data Use Limitation | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -602,9 +602,9 @@ Please provide an indication of consent permissions for datasets and/or material Please indicate fit here are any additional conditions set for use if any, multiple requirements may be provided. Please ensure that these restrictions are documented in access rights information. 
-| title | is_list | is_optional | required | type | -|:----------------------|:----------|:--------------|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Data Use Requirements | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[hdr_schemata.definitions.HDRUK.DataUseRequirements.DataUseRequirements]', 'null'] | +| title | is_list | is_optional | required | type | +|:----------------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Data Use Requirements | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -613,9 +613,9 @@ Please indicate fit here are any additional conditions set for use if any, multi Please provide the text that you would like included as part of any citation that credits this dataset. This is typically just the name of the publisher. No employee details should be provided.' 
-| title | is_list | is_optional | required | type | -|:-----------------------|:----------|:--------------|:-----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Citation Requirements' | False | True | False | ["ShortDescription[{'anyOf': [{'maxLength': 1000, 'minLength': 2, 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.ShortDescription.ShortDescription]]', 'null'] | +| title | is_list | is_optional | required | type | +|:-----------------------|:----------|:--------------|:-----------|:---------------------------------------------------------------------------------------------------------------------------| +| Citation Requirements' | False | True | False | ["ShortDescription[{'anyOf': [{'maxLength': 1000, 'minLength': 2, 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -624,9 +624,9 @@ Please provide the text that you would like included as part of any citation tha None -| title | is_list | is_optional | required | type | -|:---------------|:----------|:--------------|:-----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Investigations | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]', 'null'] | +| title | is_list | is_optional | required | type | +|:---------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Investigations | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, 
{'type': 'null'}]}]", 'List', 'null'] | @@ -635,9 +635,9 @@ None Please provide the keystone paper associated with the dataset. Also include a list of known citations, if available and should be links to existing resources where the dataset has been used or referenced. Please provide multiple entries, or if you are using a csv upload please provide them as a tab separated list. -| title | is_list | is_optional | required | type | -|:----------|:----------|:--------------|:-----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Citations | False | True | False | ["Doi[{'anyOf': [{'pattern': '^10.\\\\d{4,9}/[-._;()/:a-zA-Z0-9]+$', 'type': 'string'}, {'type': 'null'}]}]", 'str', 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Doi.Doi]]', 'null'] | +| title | is_list | is_optional | required | type | +|:----------|:----------|:--------------|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------| +| Citations | False | True | False | ["Doi[{'anyOf': [{'pattern': '^10.\\\\d{4,9}/[-._;()/:a-zA-Z0-9]+$', 'type': 'string'}, {'type': 'null'}]}]", 'str', 'List', 'null'] | @@ -679,9 +679,9 @@ Examples: Please provide link(s) to a webpage detailing the commercial model for processing data access requests for the organisation (if available) Definition: Indication of commercial model or cost (in GBP) for processing each data access request by the data custodian. 
-| title | is_list | is_optional | required | type | -|:---------------------------------|:----------|:--------------|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Organisation Access Request Cost | False | True | False | ["LongDescription[{'anyOf': [{'maxLength': 50000, 'minLength': 2, 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]', 'null'] | +| title | is_list | is_optional | required | type | +|:---------------------------------|:----------|:--------------|:-----------|:---------------------------------------------------------------------------------------------------------------------------| +| Organisation Access Request Cost | False | True | False | ["LongDescription[{'anyOf': [{'maxLength': 50000, 'minLength': 2, 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -701,9 +701,9 @@ Please provide an indication of the typical processing times based on the types Please use country code from ISO 3166-1 country codes and the associated ISO 3166-2 for regions, cities, states etc. for the country/state under whose laws the data subjects' data is collected, processed and stored. 
-| title | is_list | is_optional | required | type | -|:-------------|:----------|:--------------|:-----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Jurisdiction | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[hdr_schemata.definitions.HDRUK.Isocountrycode.Isocountrycode]', 'null'] | +| title | is_list | is_optional | required | type | +|:-------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Jurisdiction | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -756,9 +756,9 @@ Section includes technical attributes for language vocabularies, sizes etc. and List any relevant terminologies / ontologies / controlled vocabularies, such as ICD 10 Codes, NHS Data Dictionary National Codes or SNOMED CT International, that are being used by the dataset. If the controlled vocabularies are local standards, please make that explicit. If you are using a standard that has not been included in the list, please use “other” and contact support desk to ask for an addition. Notes: More than one vocabulary may be provided. 
-| title | is_list | is_optional | required | type | -|:----------------------|:----------|:--------------|:-----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Controlled Vocabulary | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[hdr_schemata.definitions.HDRUK.ControlledVocabulary.ControlledVocabulary]', 'null'] | +| title | is_list | is_optional | required | type | +|:----------------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Controlled Vocabulary | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -778,9 +778,9 @@ List any relevant terminologies / ontologies / controlled vocabularies, such as List any relevant terminologies / ontologies / controlled vocabularies, such as ICD 10 Codes, NHS Data Dictionary National Codes or SNOMED CT International, that are being used by the dataset. If the controlled vocabularies are local standards, please make that explicit. If you are using a standard that has not been included in the list, please use “other” and contact support desk to ask for an addition. Notes: More than one vocabulary may be provided. 
-| title | is_list | is_optional | required | type | -|:----------------------|:----------|:--------------|:-----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Controlled Vocabulary | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[hdr_schemata.definitions.HDRUK.Language.Language]', 'null'] | +| title | is_list | is_optional | required | type | +|:----------------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Controlled Vocabulary | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -789,9 +789,9 @@ List any relevant terminologies / ontologies / controlled vocabularies, such as List any relevant terminologies / ontologies / controlled vocabularies, such as ICD 10 Codes, NHS Data Dictionary National Codes or SNOMED CT International, that are being used by the dataset. If the controlled vocabularies are local standards, please make that explicit. If you are using a standard that has not been included in the list, please use “other” and contact support desk to ask for an addition. Notes: More than one vocabulary may be provided. 
-| title | is_list | is_optional | required | type | -|:----------------------|:----------|:--------------|:-----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Controlled Vocabulary | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[hdr_schemata.definitions.HDRUK.Format.Format]', 'null'] | +| title | is_list | is_optional | required | type | +|:----------------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Controlled Vocabulary | False | True | True | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -809,9 +809,9 @@ This section includes information about related datasets that may have previousl If applicable, please provide the DOI of other datasets that have previously been linked to this dataset and their availability. If no DOI is available, please provide the title of the datasets that can be linked, where possible using the same title of a dataset previously onboarded to the HOP. Note: If all the datasets from Gateway organisation can be linked please indicate “ALL” and the onboarding portal will automate linkage across the datasets submitted. 
-| title | is_list | is_optional | required | type | -|:----------------|:----------|:--------------|:-----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Linked Datasets | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Union[hdr_schemata.definitions.HDRUK.Url.Url, NoneType, hdr_schemata.definitions.HDRUK.OneHundredFiftyCharacters.OneHundredFiftyCharacters]]', 'null'] | +| title | is_list | is_optional | required | type | +|:----------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Linked Datasets | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -820,9 +820,9 @@ If applicable, please provide the DOI of other datasets that have previously bee Indicate if derived datasets or predefined extracts are available and the type of derivation available. Notes. Single or multiple dimensions can be provided as a derived extract alongside the dataset. 
-| title | is_list | is_optional | required | type | -|:------------|:----------|:--------------|:-----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Derivations | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.AbstractText.AbstractText]]', 'null'] | +| title | is_list | is_optional | required | type | +|:------------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Derivations | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | @@ -831,9 +831,9 @@ Indicate if derived datasets or predefined extracts are available and the type o Please provide the URL of any analysis tools or models that have been created for this dataset and are available for further use. Multiple tools may be provided. 
Note: We encourage users to adopt a model along the lines of https://www.ga4gh.org/news/tool-registry-service-api-enabling-an-interoperable-library-of-genomics-analysis-tools/ -| title | is_list | is_optional | required | type | -|:--------|:----------|:--------------|:-----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Tools | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]', 'null'] | +| title | is_list | is_optional | required | type | +|:--------|:----------|:--------------|:-----------|:------------------------------------------------------------------------------------------------------------------| +| Tools | False | True | False | ["CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", 'List', 'null'] | diff --git a/docs/HDRUK/2.2.1.structure.json b/docs/HDRUK/2.2.1.structure.json index 73815f5..4908aab 100644 --- a/docs/HDRUK/2.2.1.structure.json +++ b/docs/HDRUK/2.2.1.structure.json @@ -219,7 +219,7 @@ "examples": null, "type": [ "EmailAddress[{'anyOf': [{'format': 'email', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.EmailAddress.EmailAddress]]", + "List", "null" ], "is_list": false, @@ -267,7 +267,7 @@ ], "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[hdr_schemata.definitions.HDRUK.OneHundredFiftyCharacters.OneHundredFiftyCharacters]", + "List", "null" ], "is_list": false, @@ -282,7 +282,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - 
"typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.ShortDescription.ShortDescription]]", + "List", "null" ], "is_list": false, @@ -385,7 +385,7 @@ ], "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]", + "List", "null" ], "is_list": false, @@ -402,7 +402,7 @@ ], "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Union[hdr_schemata.definitions.HDRUK.Url.Url, NoneType, hdr_schemata.definitions.HDRUK.OneHundredFiftyCharacters.OneHundredFiftyCharacters, hdr_schemata.definitions.HDRUK.IsPartOfEnum.IsPartOfEnum]]", + "List", "null" ], "is_list": false, @@ -434,7 +434,7 @@ ], "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]", + "List", "null" ], "is_list": false, @@ -749,7 +749,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[hdr_schemata.definitions.HDRUK.DataUseLimitation.DataUseLimitation]", + "List", "null" ], "is_list": false, @@ -764,7 +764,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[hdr_schemata.definitions.HDRUK.DataUseRequirements.DataUseRequirements]", + "List", "null" ], "is_list": false, @@ -779,7 +779,7 @@ "examples": null, "type": [ "ShortDescription[{'anyOf': [{'maxLength': 1000, 'minLength': 2, 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.ShortDescription.ShortDescription]]", + "List", "null" ], "is_list": false, @@ -794,7 +794,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - 
"typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]", + "List", "null" ], "is_list": false, @@ -810,7 +810,7 @@ "type": [ "Doi[{'anyOf': [{'pattern': '^10.\\\\d{4,9}/[-._;()/:a-zA-Z0-9]+$', 'type': 'string'}, {'type': 'null'}]}]", "str", - "typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Doi.Doi]]", + "List", "null" ], "is_list": false, @@ -869,7 +869,7 @@ "examples": null, "type": [ "LongDescription[{'anyOf': [{'maxLength': 50000, 'minLength': 2, 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]", + "List", "null" ], "is_list": false, @@ -897,7 +897,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[hdr_schemata.definitions.HDRUK.Isocountrycode.Isocountrycode]", + "List", "null" ], "is_list": false, @@ -969,7 +969,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[hdr_schemata.definitions.HDRUK.ControlledVocabulary.ControlledVocabulary]", + "List", "null" ], "is_list": false, @@ -997,7 +997,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[hdr_schemata.definitions.HDRUK.Language.Language]", + "List", "null" ], "is_list": false, @@ -1012,7 +1012,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[hdr_schemata.definitions.HDRUK.Format.Format]", + "List", "null" ], "is_list": false, @@ -1044,7 +1044,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Union[hdr_schemata.definitions.HDRUK.Url.Url, NoneType, hdr_schemata.definitions.HDRUK.OneHundredFiftyCharacters.OneHundredFiftyCharacters]]", + "List", "null" ], 
"is_list": false, @@ -1059,7 +1059,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.AbstractText.AbstractText]]", + "List", "null" ], "is_list": false, @@ -1074,7 +1074,7 @@ "examples": null, "type": [ "CommaSeparatedValues[{'anyOf': [{'pattern': '([^,]+)', 'type': 'string'}, {'type': 'null'}]}]", - "typing.List[typing.Optional[hdr_schemata.definitions.HDRUK.Url.Url]]", + "List", "null" ], "is_list": false, diff --git a/hdr_schemata/utils/create_markdown.py b/hdr_schemata/utils/create_markdown.py index 38ef947..68fe80e 100644 --- a/hdr_schemata/utils/create_markdown.py +++ b/hdr_schemata/utils/create_markdown.py @@ -1,5 +1,6 @@ from pydantic import BaseModel, RootModel import pandas as pd +import copy import json import typing import enum @@ -80,7 +81,7 @@ def get_fields(structure, model: type[BaseModel]): "title": field.title, "examples": field.examples, "type": type_names, - # "types": _types, + "types": _types, "is_list": is_list, "is_optional": is_optional, } @@ -131,41 +132,77 @@ def json_to_markdown(structure, level=2): return md - -def traverse_structure(data, form, parent=None): +def form_structure(data, form, parent=None): + data = copy.deepcopy(data) for item in data: k = item.pop("name") if parent: k = parent + "." + k subItems = item.pop("subItems", None) if subItems: - traverse_structure(subItems, form, parent=k) + form_structure(subItems, form, parent=k) types = item.pop("types") - info = None + infos = [] for t in types: + info = None + if t == "null": + continue try: - if t and issubclass(t, RootModel): - info = t.model_json_schema() + if issubclass(t, RootModel): + t_sch = t.model_json_schema() + # Merge dicts with title and the types info in anyOf + if "anyOf" in t_sch: + title = {"title": t_sch["title"]} + info = {**title, **t_sch["anyOf"][0]} + else: + info = t_sch + else: + info = t.__name__ except: ... 
if type(t) == enum.EnumMeta: info = {"type": "string", "options": [m.value for m in t]} + + if info: + infos.append(info) + _ = item.pop("type") - item["types"] = info - form[k] = item + + if isinstance(infos, list): + # Skip fields where the type is a pydantic type we have defined e.g. "Organisation" + # because we drill down into the subtypes instead + if isinstance(infos[0], str) and infos[0].lower() == k.split(".")[-1]: + continue + else: + item["types"] = infos[0] + else: + item["types"] = infos + # location indicates the json path through the schema e.g. summary.abstract + # provenance.origin.purpose + item["location"] = k + form["schema_fields"].append(item) def create_markdown(Model, path, name): + + def remove_types(data): + for d in data: + d.pop("types") + if d.get("subItems", None): + remove_types(d["subItems"]) + structure = [] get_fields(structure, Model) - # form = {} - # traverse_structure(structure, form) - # print(json.dumps(form, indent=6)) + form = {} + form["schema_fields"] = [] + form_structure(structure, form) + with open(f"{path}/{name}.form.json", "w") as f: + json.dump(form, f, indent=6) with open(f"{path}/{name}.structure.json", "w") as f: - print(json.dumps(structure, indent=6)) + remove_types(structure) json.dump(structure, f, indent=6) md = json_to_markdown(structure)