diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..6624300 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,26 @@ +name: Pytest + +on: [push] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Install schema module + run: | + python -m pip install --upgrade pip + python -m pip install setuptools + python -m pip install -e . + + - name: Run pytest + run: | + cd hdr_schemata/tests/ + pytest diff --git a/hdr_schemata/__init__.py b/hdr_schemata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hdr_schemata/definitions/HDRUK/DatasetType.py b/hdr_schemata/definitions/HDRUK/DatasetType.py new file mode 100644 index 0000000..f16cc89 --- /dev/null +++ b/hdr_schemata/definitions/HDRUK/DatasetType.py @@ -0,0 +1,9 @@ +from typing import Optional +from pydantic import RootModel,constr + +#note: constructed as a string of max_length=100 +# in the future we may want to limit this with Enums +class DatasetType(RootModel): + root: Optional[constr(min_length=2, max_length=100)] + + diff --git a/hdr_schemata/definitions/HDRUK/LongAbstractText.py b/hdr_schemata/definitions/HDRUK/LongAbstractText.py new file mode 100644 index 0000000..0c0a773 --- /dev/null +++ b/hdr_schemata/definitions/HDRUK/LongAbstractText.py @@ -0,0 +1,7 @@ +from typing import Optional +from pydantic import RootModel,constr + +class LongAbstractText(RootModel): + root: Optional[constr(min_length=5, max_length=5000)] + + diff --git a/hdr_schemata/definitions/HDRUK/ShortTitle.py b/hdr_schemata/definitions/HDRUK/ShortTitle.py new file mode 100644 index 0000000..198c00b --- /dev/null +++ b/hdr_schemata/definitions/HDRUK/ShortTitle.py @@ -0,0 +1,7 @@ +from typing import Optional +from pydantic import RootModel,constr + +class ShortTitle(RootModel): + root: Optional[constr(min_length=2, 
max_length=100)] + + diff --git a/hdr_schemata/definitions/HDRUK/TwoHundredFiftyFiveCharacters.py b/hdr_schemata/definitions/HDRUK/TwoHundredFiftyFiveCharacters.py new file mode 100644 index 0000000..60d86a3 --- /dev/null +++ b/hdr_schemata/definitions/HDRUK/TwoHundredFiftyFiveCharacters.py @@ -0,0 +1,6 @@ +from pydantic import RootModel, constr + +class TwoHundredFiftyFiveCharacters(RootModel): + root: constr(min_length=2, max_length=255) + + diff --git a/hdr_schemata/definitions/HDRUK/__init__.py b/hdr_schemata/definitions/HDRUK/__init__.py index b040da7..877421a 100644 --- a/hdr_schemata/definitions/HDRUK/__init__.py +++ b/hdr_schemata/definitions/HDRUK/__init__.py @@ -4,6 +4,7 @@ from .ControlledVocabulary import ControlledVocabulary from .ControlledVocabularyEnum import ControlledVocabularyEnum from .DataUseLimitation import DataUseLimitation +from .DatasetType import DatasetType from .DataUseRequirements import DataUseRequirements from .DeliveryLeadTime import DeliveryLeadTime from .Description import Description @@ -16,6 +17,7 @@ from .Isocountrycode import Isocountrycode from .Language import Language from .LanguageEnum import LanguageEnum +from .LongAbstractText import LongAbstractText from .LongDescription import LongDescription from .MeasuredProperty import MeasuredProperty from .MemberOf import MemberOf @@ -27,10 +29,12 @@ from .Semver import Semver from .Setting import Setting from .ShortDescription import ShortDescription +from .ShortTitle import ShortTitle from .Source import Source from .StandardisedDataModels import StandardisedDataModels from .StandardisedDataModelsEnum import StandardisedDataModelsEnum from .StatisticalPopulationConstrained import StatisticalPopulationConstrained from .TimeLag import TimeLag +from .TwoHundredFiftyFiveCharacters import TwoHundredFiftyFiveCharacters from .Url import Url from .Uuidv4 import Uuidv4 diff --git a/hdr_schemata/examples/GWDM/1.0/example.json b/hdr_schemata/examples/GWDM/1.0/example.json new file mode 
100644 index 0000000..b8183c7 --- /dev/null +++ b/hdr_schemata/examples/GWDM/1.0/example.json @@ -0,0 +1,126 @@ +{ + "required": { + "gatewayId": "1234", + "gatewayPid": "5124f2", + "issued": "2020-08-05T14:35:59Z", + "modified": "2021-01-28T14:15:46Z", + "revisions": [ + { + "version": "1.0.0", + "url": "https://d5faf9c6-6c34-46d7-93c4-7706a5436ed9" + }, + { + "version": "2.0.0", + "url": "https://a7ddefbd-31d9-4703-a738-256e4689f76a" + }, + { + "version": "0.0.1", + "url": "https://9e798632-442a-427b-8d0e-456f754d28dc" + }, + { + "version": "2.1.1", + "url": "https://a7ddefbd-31d9-4703-a738-256e4689f76a" + } + ] + }, + "summary": { + "abstract": "Publications that mention HDR-UK (or any variant thereof) in Acknowledgements or Author Affiliations", + "contactPoint": "susheel.varma@hdruk.ac.uk", + "keywords": "Preprints,Papers,HDR UK", + "controlledKeywords": "Papers", + "datasetType": "list of papers", + "description": "Publications that mention HDR-UK (or any variant thereof) in Acknowledgements or Author Affiliations\n\nThis will include:\n- Papers\n- COVID-19 Papers\n- COVID-19 Preprint", + "doiName": "10.1093/ije/dyx196", + "shortTitle": "HDR UK Papers & Preprints", + "title": "Publications that mention HDR-UK (or any variant thereof) in Acknowledgements or Author Affiliations", + "publisher": { + "publisherName": "HEALTH DATA RESEARCH UK" + } + }, + "coverage": { + "pathway": "NOT APPLICABLE", + "physicalSampleAvailability": "NOT AVAILABLE", + "spatial": "https://www.geonames.org/countries/GB/united-kingdom.html", + "followup": "UNKNOWN", + "typicalAgeRange": "0-0" + }, + "provenance": { + "origin": { + "purpose": "OTHER", + "source": "MACHINE GENERATED", + "collectionSituation": "OTHER" + }, + "temporal": { + "endDate": "2022-04-30", + "startDate": "2020-03-31", + "timeLag": "NOT APPLICABLE", + "accrualPeriodicity": "DAILY", + "distributionReleaseDate": "2020-11-27" + } + }, + "accessibility": { + "access": { + "deliveryLeadTime": "OTHER", + "jurisdiction": 
"GB-ENG", + "dataController": "HDR UK", + "dataProcessor": "HDR UK", + "accessRights": "https://raw.githubusercontent.com/HDRUK/papers/master/LICENSE", + "accessService": "https://github.com/HDRUK/papers", + "accessRequestCost": "Free" + }, + "usage": { + "dataUseLimitation": "GENERAL RESEARCH USE", + "dataUseRequirement": "RETURN TO DATABASE OR RESOURCE", + "resourceCreator": "HDR UK Science Team" + }, + "formatAndStandards": { + "vocabularyEncodingSchemes": "OTHER", + "conformsTo": "OTHER", + "languages": "en", + "formats": "CSV,JSON" + } + }, + "linkage": { + "isGeneratedUsing": "something", + "dataUses": "dunno", + "isReferenceIn": "10.5281/zenodo.326615", + "tools": "https://github.com/HDRUK/papers", + "datasetLinkage": { + "isDerivedFrom": "https://web.www.healthdatagateway.org/dataset/fd8d0743-344a-4758-bb97-f8ad84a37357", + "isPartOf": "NOT APPLICABLE", + "isMemberOf": "blah", + "linkedDatasets": "https://web.www.healthdatagateway.org/dataset/fd8d0743-344a-4758-bb97-f8ad84a37357" + }, + "investigations": "https://github.com/HDRUK/papers" + }, + "observations": [ + { + "observedNode": "FINDINGS", + "measuredValue": 575, + "observationDate": "2020-11-27", + "measuredProperty": "Count", + "disambiguatingDescription": "Number of papers with affiliation and/or acknowledgement to HDR UK" + } + ], + "structuralMetadata": [ + { + "name": "table1", + "description": "this is table 1", + "columns": [ + { + "name": "column1", + "description": "this is column1", + "dataType": "String", + "sensitive": false, + "values": [ + { + "name": "value", + "description": "this is value1", + "frequency":50 + } + ] + } + ] + } + ] +} diff --git a/hdr_schemata/models/HDRUK/2.1.2/example.json b/hdr_schemata/examples/HDRUK/2.1.2/example.json similarity index 100% rename from hdr_schemata/models/HDRUK/2.1.2/example.json rename to hdr_schemata/examples/HDRUK/2.1.2/example.json diff --git a/hdr_schemata/models/GWDM/1.0/schema.json b/hdr_schemata/models/GWDM/1.0/schema.json new file mode 
100644 index 0000000..09d8db7 --- /dev/null +++ b/hdr_schemata/models/GWDM/1.0/schema.json @@ -0,0 +1,1453 @@ +{ + "$defs": { + "AbstractText": { + "anyOf": [ + { + "maxLength": 500, + "minLength": 5, + "type": "string" + }, + { + "type": "null" + } + ], + "title": "AbstractText" + }, + "Access": { + "additionalProperties": false, + "properties": { + "accessRights": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Optional link(s) or a description of where the license associated to accessing this dataset", + "example": "https://raw.githubusercontent.com/HDRUK/papers/master/LICENSE", + "title": "Access Rights" + }, + "accessService": { + "anyOf": [ + { + "$ref": "#/$defs/LongDescription" + }, + { + "type": "null" + } + ], + "default": null, + "description": "", + "example": "The SAIL Databank is powered by the UK Secure e-Research Platform (UKSeRP). Following approval through safeguard processes, access to project-specific data within the secure environment is permitted using two-factor authentication.", + "title": "Access Service" + }, + "accessRequestCost": { + "anyOf": [ + { + "$ref": "#/$defs/LongDescription" + }, + { + "type": "null" + } + ], + "default": null, + "description": "", + "example": "Data provision is free from SAIL. Overall project costing depends on the number of people that require access to the SAIL Gateway, the activities that SAIL needs to complete (e.g. 
loading non-standard datasets), data refreshes, analytical work required, disclosure control process, and special case technological requirements.", + "title": "Organisation Access Request Cost" + }, + "deliveryLeadTime": { + "anyOf": [ + { + "$ref": "#/$defs/DeliveryLeadTime" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An arbitrary guess at the time to gain access to the dataset...", + "example": "2-6 MONTHS", + "title": "Access Request Duration" + }, + "jurisdiction": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Comma separated country codes of where the data jurisdiction is.", + "example": "GB-WLS,GB-GBN,GB-SCT", + "title": "Jurisdiction" + }, + "dataController": { + "anyOf": [ + { + "$ref": "#/$defs/LongDescription" + }, + { + "type": "null" + } + ], + "description": "Name of the data controller", + "example": "SAIL Databank", + "title": "Data Controller" + }, + "dataProcessor": { + "anyOf": [ + { + "$ref": "#/$defs/LongDescription" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of the data processors", + "example": "SAIL Databank", + "title": "Data Processor" + } + }, + "required": [ + "accessRights", + "jurisdiction", + "dataController" + ], + "title": "Access", + "type": "object" + }, + "Accessibility": { + "additionalProperties": false, + "properties": { + "usage": { + "anyOf": [ + { + "$ref": "#/$defs/Usage" + }, + { + "type": "null" + } + ], + "default": null, + "description": "This section includes information about how the data can be used and how it is currently being used", + "title": "Usage" + }, + "access": { + "allOf": [ + { + "$ref": "#/$defs/Access" + } + ], + "description": "This section includes information about data access" + }, + "formatAndStandards": { + "anyOf": [ + { + "$ref": "#/$defs/FormatAndStandards" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Section includes technical attributes 
for language vocabularies, sizes etc. and gives researchers facts about and processing the underlying data in the dataset.", + "title": "Format and Standards" + } + }, + "required": [ + "access" + ], + "title": "Accessibility", + "type": "object" + }, + "AgeRange": { + "anyOf": [ + { + "pattern": "Not Known|(150|1[0-4][0-9]|[0-9]|[1-8][0-9]|9[0-9])-(150|1[0-4][0-9]|[0-9]|[1-8][0-9]|9[0-9])", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "AgeRange" + }, + "CommaSeparatedValues": { + "anyOf": [ + { + "pattern": "([^,]+)", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "CommaSeparatedValues" + }, + "Coverage": { + "properties": { + "spatial": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of countries where the data was taken from", + "example": "United Kingdom,Wales,England", + "title": "Spatial" + }, + "physicalSampleAvailability": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "A list of what the dataset actually contains in terms of sample measurements", + "example": "DNA,PLASMA,SERUM,URINE,WHOLE BLOOD", + "title": "Physical Sample Availability" + }, + "pathway": { + "anyOf": [ + { + "$ref": "#/$defs/LongDescription" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Long description of the clinical/diagnostic/treatment pathway if applicable. This could include if the dataset is from a single speciality or area, a single tier of care, linked across two tier (e.g. primary and secondary care), or an integrated care record covering the whole patient pathway.", + "example": "The lookup contains references to link data held elsewhere on:\n\u2022 individuals appearing as defendants in criminal cases dealt with by the magistrates' or Crown Court in England and Wales (including Youth Courts). 
\n\u2022 individuals supervised by the probation service in England and Wales\n\u2022 individuals serving custodial sentences in England & Wales who appear within records from the prison data source, p-NOMIS. Young Offenders are included if resident at prisons or Young Offender Institutes (YOIs) that use p-NOMIS, however, this excludes the majority of Secure Schools and Secure Training Centres. \"\n\n\"The linking dataset includes a person ID and link to record in other data first datasets for: \n\u2022 Disposals in the magistrates\u2019 court from 1 January 2011 to 31 December 2020\n\u2022 Disposals in the Crown Court from 1 January 2013 to 31 December 2020\n\u2022 Custodial sentences of offenders in custody from January 2011 to September 2021 (including sentences begun before 2011) \n\u2022 Offender probation records from January 2014 to December 2020.", + "title": "Pathway" + }, + "followup": { + "anyOf": [ + { + "$ref": "#/$defs/Followup" + }, + { + "type": "null" + } + ], + "default": null, + "description": "What is the typical time span that a patient appears in the dataset (follow up period)", + "example": "CONTINUOUS", + "title": "Followup" + }, + "typicalAgeRange": { + "anyOf": [ + { + "$ref": "#/$defs/AgeRange" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Age range in whole years of participants in the dataset. 
Please provide range in the following format '[min age] \u2013 [max age]' where both the minimum and maximum are whole numbers (integers).", + "example": "1-150", + "title": "Typical Age Range" + } + }, + "title": "Coverage", + "type": "object" + }, + "DataColumn": { + "additionalProperties": true, + "properties": { + "name": { + "allOf": [ + { + "$ref": "#/$defs/Name" + } + ], + "description": "The name of a column in a table.", + "title": "Column Name" + }, + "dataType": { + "description": "The data type of values in the column", + "title": "Data Type", + "type": "string" + }, + "description": { + "anyOf": [ + { + "maxLength": 20000, + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "A description of a column in a table.", + "title": "Column Description" + }, + "sensitive": { + "description": "A True or False value, indicating if the field is sensitive or not", + "title": "Sensitive", + "type": "boolean" + }, + "values": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/DataValue" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "description": "Data values contained within the column", + "title": "Values" + } + }, + "required": [ + "name", + "dataType", + "sensitive", + "values" + ], + "title": "DataColumn", + "type": "object" + }, + "DataTable": { + "additionalProperties": false, + "properties": { + "name": { + "anyOf": [ + { + "maxLength": 500, + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The name of a table in a dataset.", + "title": "Table Name" + }, + "description": { + "anyOf": [ + { + "maxLength": 20000, + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "A description of a table in a dataset.", + "title": "Table Description" + }, + "columns": { + "description": "A list of columns contained within a table in a dataset.", + "items": { + "$ref": "#/$defs/DataColumn" + }, + "title": "Data 
Columns", + "type": "array" + } + }, + "required": [ + "name", + "columns" + ], + "title": "DataTable", + "type": "object" + }, + "DataValue": { + "additionalProperties": true, + "properties": { + "name": { + "allOf": [ + { + "$ref": "#/$defs/Name" + } + ], + "description": "Unique value in a column.", + "title": "Value Name" + }, + "description": { + "anyOf": [ + { + "maxLength": 20000, + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "A description of a unique value in a column.", + "title": "Value Description" + }, + "frequency": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The frequency of occurrence of a value in a column", + "title": "Value Frequency" + } + }, + "required": [ + "name" + ], + "title": "DataValue", + "type": "object" + }, + "DatasetLinkage": { + "additionalProperties": false, + "properties": { + "isDerivedFrom": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Indicate if derived datasets or predefined extracts are available and the type of derivation available. Notes. 
Single or multiple dimensions can be provided as a derived extract alongside the dataset", + "example": "Data will be minimised as appropriate relative to the data access application", + "title": "Derivations" + }, + "isPartOf": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "If the dataset is part of a group or family", + "example": "UKCRC Tissue Directory and Coordination Centre", + "title": "Is PartOf" + }, + "isMemberOf": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Dataset is a member of XXX(?)", + "title": "Is MemberOf" + }, + "linkedDatasets": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Links to other datasets.", + "example": "Yes. To any SAIL dataset & reference data.,ALL", + "title": "Linked Datasets" + } + }, + "title": "DatasetLinkage", + "type": "object" + }, + "DatasetType": { + "anyOf": [ + { + "maxLength": 100, + "minLength": 2, + "type": "string" + }, + { + "type": "null" + } + ], + "title": "DatasetType" + }, + "DeliveryLeadTime": { + "enum": [ + "LESS 1 WEEK", + "1-2 WEEKS", + "2-4 WEEKS", + "1-2 MONTHS", + "2-6 MONTHS", + "MORE 6 MONTHS", + "VARIABLE", + "NOT APPLICABLE", + "OTHER", + null + ], + "title": "DeliveryLeadTime" + }, + "Doi": { + "anyOf": [ + { + "pattern": "^10.\\d{4,9}/[-._;()/:a-zA-Z0-9]+$", + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Doi" + }, + "Followup": { + "enum": [ + "0 - 6 MONTHS", + "6 - 12 MONTHS", + "1 - 10 YEARS", + "> 10 YEARS", + "UNKNOWN", + "CONTINUOUS", + "OTHER", + null + ], + "title": "Followup" + }, + "FormatAndStandards": { + "additionalProperties": false, + "properties": { + "vocabularyEncodingSchemes": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Code value of the 
ontology vocabulary encoding", + "example": "OPCS4,NHS NATIONAL CODES,ICD10,OTHER", + "title": "Controlled Vocabulary" + }, + "conformsTo": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "What the vocabulary conforms to.", + "example": "LOCAL,NHS DATA DICTIONARY", + "title": "Conforms To" + }, + "languages": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Language code(s) of the language of the dataset metadata and underlying data is made available.", + "example": "en", + "title": "Language Code(s)" + }, + "formats": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Format(s) the dataset can be made available in", + "example": "CSV,JSON,SQL database table", + "title": "Dataset Format" + } + }, + "required": [ + "vocabularyEncodingSchemes", + "conformsTo", + "languages", + "formats" + ], + "title": "FormatAndStandards", + "type": "object" + }, + "Linkage": { + "additionalProperties": false, + "properties": { + "isGeneratedUsing": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "??", + "title": "Is Generated Using" + }, + "associatedMedia": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Any media associated with the Gateway Organisation using a valid URI for the content. 
This is an opportunity to provide additional context that could be useful for researchers wanting to understand more about the dataset and its relevance to their research question", + "example": "https://popdatasci.swan.ac.uk/centres-of-excellence/sail/,https://www.youtube.com/watch?v=ZK9-Jw3uVkw,https://saildatabank.com/,https://saildatabank.com/about-us/", + "title": "Associated Media" + }, + "dataUses": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "??", + "title": "Data Uses" + }, + "isReferenceIn": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The keystone paper associated with the dataset. Also include a list of known citations, if available and should be links to existing resources where the dataset has been used or referenced.", + "title": "Is Reference in" + }, + "tools": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "URL of any analysis tools or models that have been created for this dataset and are available for further use", + "example": "https://conceptlibrary.saildatabank.com/", + "title": "Tools" + }, + "datasetLinkage": { + "anyOf": [ + { + "$ref": "#/$defs/DatasetLinkage" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Dataset Linkage copied over from", + "title": "Dataset Linkage" + }, + "investigations": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Please provide the keystone paper associated with the dataset.", + "example": "https://digital.nhs.uk/services/data-access-request-service-dars/register-of-approved-data-releases", + "title": "Investigations" + } + }, + "title": "Linkage", + "type": "object" + }, + "LongAbstractText": { + "anyOf": [ + { + "maxLength": 5000, + "minLength": 5, 
+ "type": "string" + }, + { + "type": "null" + } + ], + "title": "LongAbstractText" + }, + "LongDescription": { + "anyOf": [ + { + "maxLength": 50000, + "minLength": 2, + "type": "string" + }, + { + "type": "null" + } + ], + "title": "LongDescription" + }, + "MeasuredProperty": { + "title": "MeasuredProperty" + }, + "Name": { + "title": "Name" + }, + "Observation": { + "additionalProperties": false, + "properties": { + "observedNode": { + "allOf": [ + { + "$ref": "#/$defs/StatisticalPopulationConstrained" + } + ], + "description": "Please select one of the following statistical populations for your observation", + "examples": [ + "PERSONS" + ], + "title": "Statistical Population" + }, + "measuredValue": { + "description": "Please provide the population size associated with the population type the dataset i.e. 1000 people in a study, or 87 images (MRI) of Knee Usage Note: Used with Statistical Population, which specifies the type of the population in the dataset.", + "title": "Measured Value", + "type": "integer" + }, + "disambiguatingDescription": { + "anyOf": [ + { + "$ref": "#/$defs/AbstractText" + }, + { + "type": "null" + } + ], + "default": null, + "description": "If SNOMED CT term does not provide sufficient detail, please provide a description that disambiguates the population type.", + "title": "Disambiguating Description" + }, + "observationDate": { + "anyOf": [ + { + "format": "date", + "type": "string" + }, + { + "format": "date-time", + "type": "string" + } + ], + "description": "Please provide the date that the observation was made. Some datasets may be continuously updated and the number of records will change regularly, so the observation date provides users with the date that the analysis or query was run to generate the particular observation. Multiple observations can be made i.e. an observation of cumulative COVID positive cases by specimen on the 1/1/2021 could be 2M. On the 8/1/2021 a new observation could be 2.1M. 
Users can add multiple observations.", + "title": "Observation Date" + }, + "measuredProperty": { + "allOf": [ + { + "$ref": "#/$defs/MeasuredProperty" + } + ], + "description": "Initially this will be defaulted to \"COUNT\"", + "title": "Measured Property" + } + }, + "required": [ + "observedNode", + "measuredValue", + "observationDate", + "measuredProperty" + ], + "title": "Observation", + "type": "object" + }, + "Origin": { + "additionalProperties": false, + "properties": { + "purpose": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Indicates the purpose(s) that the dataset was collected.", + "example": "ADMINISTRATIVE,STATUTORY", + "title": "Purpose" + }, + "source": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Indicates the source of the data extraction", + "example": "PAPER BASED,ELECTRONIC SURVEY", + "title": "Source" + }, + "collectionSituation": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Indicate the setting(s) where data was collected. 
Multiple settings may be provided", + "example": "IN-PATIENTS,PRIMARY CARE", + "title": "Setting" + } + }, + "title": "Origin", + "type": "object" + }, + "Periodicity": { + "enum": [ + "STATIC", + "IRREGULAR", + "CONTINUOUS", + "BIENNIAL", + "ANNUAL", + "BIANNUAL", + "QUARTERLY", + "BIMONTHLY", + "MONTHLY", + "BIWEEKLY", + "WEEKLY", + "SEMIWEEKLY", + "DAILY", + "OTHER", + null + ], + "title": "Periodicity" + }, + "Provenance": { + "additionalProperties": false, + "properties": { + "origin": { + "anyOf": [ + { + "$ref": "#/$defs/Origin" + }, + { + "type": "null" + } + ], + "default": null + }, + "temporal": { + "$ref": "#/$defs/Temporal" + } + }, + "required": [ + "temporal" + ], + "title": "Provenance", + "type": "object" + }, + "Publisher": { + "properties": { + "publisherName": { + "anyOf": [ + { + "$ref": "#/$defs/Name" + }, + { + "type": "null" + } + ], + "description": "The organisation responsible for running or supporting the data access request process, as well as publishing and maintaining the metadata. 
In most cases this will be the same as the HDR UK Organisation (Hub or Alliance Member).", + "example": "SAIL", + "title": "Publisher name" + }, + "publisherGatewayId": { + "anyOf": [ + { + "maxLength": 50, + "minLength": 2, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The link to an ID somewhere in the gateway where more information on the publisher can be retrieved.", + "title": "Publisher gateway id" + } + }, + "required": [ + "publisherName" + ], + "title": "Publisher", + "type": "object" + }, + "Required": { + "properties": { + "gatewayId": { + "description": "Need a field in Mauro that captures the datasetID to link to gateway database - or can we just use the one created in Mauro?", + "maxLength": 50, + "minLength": 2, + "title": "Gatewayid", + "type": "string" + }, + "gatewayPid": { + "description": "Need a field in Mauro that captures the dataset pid to link to gateway database", + "maxLength": 50, + "minLength": 2, + "title": "Gatewaypid", + "type": "string" + }, + "issued": { + "description": "Aren't issued and modified always the same because of versioning? Is that fine to duplicate because datasets in dcat might look different?", + "format": "date-time", + "title": "Issued", + "type": "string" + }, + "modified": { + "description": "Aren't issued and modified always the same because of versioning? 
Is that fine to duplicate because datasets in dcat might look different?", + "format": "date-time", + "title": "Modified", + "type": "string" + }, + "revisions": { + "items": { + "$ref": "#/$defs/Revision" + }, + "title": "Revisions", + "type": "array" + } + }, + "required": [ + "gatewayId", + "gatewayPid", + "issued", + "modified", + "revisions" + ], + "title": "Required", + "type": "object" + }, + "Revision": { + "properties": { + "version": { + "description": "Version number used for previous version of this dataset", + "example": "6.0.0", + "maxLength": 100, + "minLength": 2, + "title": "revision version", + "type": "string" + }, + "url": { + "allOf": [ + { + "$ref": "#/$defs/Url" + } + ], + "description": "Some url with a reference to the record of a previous version of this dataset", + "example": "https://api.service.nhs.uk/health-research-data-catalogue/datasetrevisions/841f7da2-b018-41f6-b4ae-2e0aadab6561", + "title": "revision url" + } + }, + "required": [ + "version", + "url" + ], + "title": "Revision", + "type": "object" + }, + "ShortDescription": { + "anyOf": [ + { + "maxLength": 1000, + "minLength": 2, + "type": "string" + }, + { + "type": "null" + } + ], + "title": "ShortDescription" + }, + "ShortTitle": { + "anyOf": [ + { + "maxLength": 100, + "minLength": 2, + "type": "string" + }, + { + "type": "null" + } + ], + "title": "ShortTitle" + }, + "StatisticalPopulationConstrained": { + "enum": [ + "PERSONS", + "EVENTS", + "FINDINGS" + ], + "title": "StatisticalPopulationConstrained", + "type": "string" + }, + "Summary": { + "properties": { + "title": { + "allOf": [ + { + "$ref": "#/$defs/TwoHundredFiftyFiveCharacters" + } + ], + "description": "The main title of the dataset", + "example": "Publications that mention HDR-UK (or any variant thereof) in Acknowledgements or Author Affiliations", + "title": "Title" + }, + "shortTitle": { + "anyOf": [ + { + "$ref": "#/$defs/ShortTitle" + }, + { + "type": "null" + } + ], + "description": "A shorter descriptive 
title of the dataset", + "example": "ONS 2011 Census Wales (CENW)", + "title": "Shorttitle" + }, + "doiName": { + "anyOf": [ + { + "$ref": "#/$defs/Doi" + }, + { + "type": "null" + } + ], + "description": "DOI associated to this dataset", + "example": "10.1093/ije/dyx196", + "title": "Doiname" + }, + "abstract": { + "allOf": [ + { + "$ref": "#/$defs/LongAbstractText" + } + ], + "description": "Longer abstract detailing the dataset.", + "example": "COVID-19 Key Worker Testing Results data is required by NHS Digital to support COVID-19 requests for linkage, analysis and dissemination to other organisations who require the data in a timely manner.", + "title": "Abstract" + }, + "keywords": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Comma separated key words associated to this dataset.", + "example": "Preprints,Papers,HDR UK", + "title": "Keywords" + }, + "controlledKeywords": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Keywords that have been filtered and limited", + "title": "Controlled Keywords" + }, + "contactPoint": { + "anyOf": [ + { + "format": "email", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "email of a person who can be the main contact point of this dataset", + "example": "susheel.varma@hdruk.ac.uk", + "title": "Contact Point" + }, + "datasetType": { + "anyOf": [ + { + "$ref": "#/$defs/DatasetType" + }, + { + "type": "null" + } + ], + "description": "What type of dataset is this?", + "title": "Dataset type" + }, + "description": { + "anyOf": [ + { + "$ref": "#/$defs/LongDescription" + }, + { + "type": "null" + } + ], + "description": "Longer description of the dataset in detail", + "example": "Publications that mention HDR-UK (or any variant thereof) in Acknowledgements or Author Affiliations\n\nThis will include:\n- Papers\n- COVID-19 Papers\n- COVID-19 Preprint", + "title": "Description" + 
}, + "publisher": { + "anyOf": [ + { + "$ref": "#/$defs/Publisher" + }, + { + "type": "null" + } + ], + "description": "Link to details about the publisher of this dataset", + "title": "Publisher" + } + }, + "required": [ + "title", + "shortTitle", + "doiName", + "abstract", + "keywords", + "controlledKeywords", + "contactPoint", + "datasetType", + "description", + "publisher" + ], + "title": "Summary", + "type": "object" + }, + "Temporal": { + "additionalProperties": false, + "properties": { + "startDate": { + "anyOf": [ + { + "format": "date", + "type": "string" + }, + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The start of the time period that the dataset provides coverage for", + "example": "12/03/2020", + "title": "Start Date" + }, + "endDate": { + "anyOf": [ + { + "format": "date", + "type": "string" + }, + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The end of the time period that the dataset provides coverage for", + "example": "12/03/2020", + "title": "End Date" + }, + "timeLag": { + "allOf": [ + { + "$ref": "#/$defs/TimeLag" + } + ], + "description": "Rypical time-lag between an event and the data for that event appearing in the dataset", + "example": "LESS 1 WEEK", + "title": "Time Lag" + }, + "accrualPeriodicity": { + "allOf": [ + { + "$ref": "#/$defs/Periodicity" + } + ], + "description": "frequency of distribution release. If a dataset is distributed regularly please choose a distribution release periodicity from the constrained list and indicate the next release date. 
When the release date becomes historical, a new release date will be calculated based on the publishing periodicity.", + "example": "MONTHLY" + }, + "distributionReleaseDate": { + "anyOf": [ + { + "format": "date", + "type": "string" + }, + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Date of the latest release of the dataset. If this is a regular release i.e. quarterly, or this is a static dataset please complete this alongside Periodicity.", + "title": "Release Date" + } + }, + "required": [ + "startDate", + "timeLag", + "accrualPeriodicity" + ], + "title": "Temporal", + "type": "object" + }, + "TimeLag": { + "enum": [ + "LESS 1 WEEK", + "1-2 WEEKS", + "2-4 WEEKS", + "1-2 MONTHS", + "2-6 MONTHS", + "MORE 6 MONTHS", + "VARIABLE", + "NO TIMELAG", + "NOT APPLICABLE", + "OTHER", + null + ], + "title": "TimeLag" + }, + "TwoHundredFiftyFiveCharacters": { + "maxLength": 255, + "minLength": 2, + "title": "TwoHundredFiftyFiveCharacters", + "type": "string" + }, + "Url": { + "anyOf": [ + { + "format": "uri", + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Url" + }, + "Usage": { + "additionalProperties": false, + "properties": { + "dataUseLimitation": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Any restrictions to its usage", + "example": "GENERAL RESEARCH USE,PROJECT SPECIFIC RESTRICTIONS", + "title": "Data Use Limitation" + }, + "dataUseRequirement": { + "anyOf": [ + { + "$ref": "#/$defs/CommaSeparatedValues" + }, + { + "type": "null" + } + ], + "description": "Any requirements needed for data usage", + "example": "PROJECT SPECIFIC RESTRICTIONS,TIME LIMIT ON USE,USER SPECIFIC RESTRICTION", + "title": "Data Use Requirements" + }, + "resourceCreator": { + "anyOf": [ + { + "$ref": "#/$defs/ShortDescription" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Who has created this 
resource", + "example": "Ministry of Justice", + "title": "Resource Creator" + } + }, + "required": [ + "dataUseLimitation", + "dataUseRequirement" + ], + "title": "Usage", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "required": { + "allOf": [ + { + "$ref": "#/$defs/Required" + } + ], + "description": "required metadata needed for the GWDM" + }, + "summary": { + "allOf": [ + { + "$ref": "#/$defs/Summary" + } + ], + "description": "Summary of metadata describing key pieces of information." + }, + "coverage": { + "anyOf": [ + { + "$ref": "#/$defs/Coverage" + }, + { + "type": "null" + } + ], + "description": "Spatial and Temporal coverage", + "title": "Coverage" + }, + "provenance": { + "anyOf": [ + { + "$ref": "#/$defs/Provenance" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Provenance information", + "title": "Provenance" + }, + "accessibility": { + "allOf": [ + { + "$ref": "#/$defs/Accessibility" + } + ], + "description": "Accessibility information." 
+ }, + "linkage": { + "anyOf": [ + { + "$ref": "#/$defs/Linkage" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Linkage and enrichment.", + "title": "Linkage" + }, + "observations": { + "description": "Obsservations", + "items": { + "$ref": "#/$defs/Observation" + }, + "title": "Observations", + "type": "array" + }, + "structuralMetadata": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/DataTable" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Descriptions of all tables and data elements that can be included in the dataset", + "title": "Structural Metadata" + } + }, + "required": [ + "required", + "summary", + "coverage", + "accessibility", + "observations" + ], + "title": "Gwdm10", + "type": "object" +} \ No newline at end of file diff --git a/hdr_schemata/models/GWDM/__init__.py b/hdr_schemata/models/GWDM/__init__.py new file mode 100644 index 0000000..f5cc9f9 --- /dev/null +++ b/hdr_schemata/models/GWDM/__init__.py @@ -0,0 +1 @@ +from .v1_0 import Gwdm10 diff --git a/hdr_schemata/models/GWDM/base/Access.py b/hdr_schemata/models/GWDM/base/Access.py new file mode 100644 index 0000000..b9c9c30 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Access.py @@ -0,0 +1,82 @@ +from typing import Optional, List, Union +from pydantic import BaseModel, Field, constr +from hdr_schemata.definitions.HDRUK import * + +class Access(BaseModel): + class Config: + extra = 'forbid' + + #note: do we want to make this CommaSeparatedValues of URLs? + # dont allow a description of the license? + accessRights: Optional[CommaSeparatedValues] = Field( + ..., + description='Optional link(s) or a description of where the license associated to accessing this dataset', + example='https://raw.githubusercontent.com/HDRUK/papers/master/LICENSE', + title='Access Rights' + ) + + #note: do we want to limit these to actual accessibly service objects? 
+ # e.g.: Optional[AccessServiceEnum] + # options: [SAIL,HIC,DARE,...,OTHER] + # + #note: do we want to include something accessServiceType (?) + # options: [TRE, SDE, University, SafeHaven, ... ] + accessService: Optional[LongDescription] = Field( + None, + description='', + example="The SAIL Databank is powered by the UK Secure e-Research Platform (UKSeRP). Following approval through safeguard processes, access to project-specific data within the secure environment is permitted using two-factor authentication.", + title='Access Service', + ) + + + #note: as above... having a long description seems odd to me... + # we could associate this with an Optional[AccessService] + # class AccessService(BaseModel): + # serviceName: ShortTitle + # serviceDescription: LongDescription + # cost: 'free'|'paid'|'other' + # timeToAccess: DeliveryLeadTime + accessRequestCost: Optional[LongDescription] = Field( + None, + description='', + example="Data provision is free from SAIL. Overall project costing depends on the number of people that require access to the SAIL Gateway, the activities that SAIL needs to complete (e.g. loading non-standard datasets), data refreshes, analytical work required, disclosure control process, and special case technological requirements.", + title='Organisation Access Request Cost', + ) + + #note: related to above... this is hard to know or guess.. + # not useful to a researcher as they can tell this is made up... + # may remove or class as an 'AccessService' + deliveryLeadTime: Optional[DeliveryLeadTime] = Field( + None, + description='An arbitrary guess at the time to gain access to the dataset...', + example='2-6 MONTHS', + title='Access Request Duration', + ) + + #note: May want to make this a CommaSeparatedListIsoCountryCode + # e.g.
GB-XXX + jurisdiction: Optional[CommaSeparatedValues] = Field( + ..., + description="Comma separated country codes of where the data jurisdiction is.", + example="GB-WLS,GB-GBN,GB-SCT", + title='Jurisdiction', + ) + + #note: this could also be associated with the AccessService? + # could the dataController not be the TRE for example? + # terminology could be confusing here... + dataController: Optional[LongDescription] = Field( + ..., + description="Name of the data controller", + example="SAIL Databank", + title='Data Controller', + ) + + #note: as with dataController-- what does this mean? + # are these often different? + dataProcessor: Optional[LongDescription] = Field( + None, + description='Name of the data processors', + example='SAIL Databank', + title='Data Processor', + ) diff --git a/hdr_schemata/models/GWDM/base/Accessibility.py b/hdr_schemata/models/GWDM/base/Accessibility.py new file mode 100644 index 0000000..0aa9497 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Accessibility.py @@ -0,0 +1,27 @@ +from typing import Optional +from pydantic import BaseModel, Field +from hdr_schemata.definitions.HDRUK import * + +from .Usage import Usage +from .Access import Access +from .FormatAndStandards import FormatAndStandards + +class Accessibility(BaseModel): + class Config: + extra = 'forbid' + + usage: Optional[Usage] = Field( + None, + description='This section includes information about how the data can be used and how it is currently being used', + title='Usage', + ) + access: Access = Field( + ..., + description='This section includes information about data access', + title='Access', + ) + formatAndStandards: Optional[FormatAndStandards] = Field( + None, + description='Section includes technical attributes for language vocabularies, sizes etc. 
and gives researchers facts about and processing the underlying data in the dataset.', + title='Format and Standards', + ) diff --git a/hdr_schemata/models/GWDM/base/Coverage.py b/hdr_schemata/models/GWDM/base/Coverage.py new file mode 100644 index 0000000..3764d81 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Coverage.py @@ -0,0 +1,72 @@ +from datetime import date, datetime +from enum import Enum +from typing import List, Optional, Union + +from pydantic import AnyUrl, BaseModel, EmailStr, Field, constr + +from hdr_schemata.definitions.HDRUK import * + +class Coverage(BaseModel): + + #note: limit these to country codes? + # what about regions? surely it's more interesting to know: + # scotland:lothian, england:yorkshire, etc. + spatial: Optional[CommaSeparatedValues] = Field( + None, + description="List of countries where the data was taken from", + example="United Kingdom,Wales,England", + title='Spatial' + ) + + + #note: limit these instead of arbitrary CommaSeparatedValues + # see: https://github.com/HDRUK/schemata-2/blob/master/hdr_schemata/definitions/HDRUK/PhysicalSampleAvailability.py + #note: this would also be useful to know in a bit more detail, and not always 'sample' + # e.g.: :::: ... SERUM::IgG::Abbott + #note: What about measurements (e.g. serum anti-IgG to SARs-CoV-2)? + # What about drug exposures (e.g. vaccine AZ)? + # What about medicine given (e.g. diazepam) ? + # What about observations (e.g. smoker)? + physicalSampleAvailability: Optional[CommaSeparatedValues] = Field( + None, + description="A list of what the dataset actually contains in terms of sample measurements", + example="DNA,PLASMA,SERUM,URINE,WHOLE BLOOD", + title='Physical Sample Availability' + ) + + #note: missing coverage of the types of people the dataset is covering? + # diabetes, cancer, care home residents, smokers, etc. etc. + # i.e. 
not just the 'sample' + # + #Could do something OMOP-like: + # measurements: Optional[CommaSeparatedValues] + # drugs: Optional[CommaSeparatedValues] + # observations: Optional[CommaSeparatedValues] + + #note: is this appropriate in this coverage section? + pathway: Optional[LongDescription] = Field( + None, + description="Long description of the clinical/diagnostic/treatment pathway if applicable. This could include if the dataset is from a single speciality or area, a single tier of care, linked across two tier (e.g. primary and secondary care), or an integrated care record covering the whole patient pathway.", + example= "The lookup contains references to link data held elsewhere on:\n• individuals appearing as defendants in criminal cases dealt with by the magistrates' or Crown Court in England and Wales (including Youth Courts). \n• individuals supervised by the probation service in England and Wales\n• individuals serving custodial sentences in England & Wales who appear within records from the prison data source, p-NOMIS. Young Offenders are included if resident at prisons or Young Offender Institutes (YOIs) that use p-NOMIS, however, this excludes the majority of Secure Schools and Secure Training Centres. 
\"\n\n\"The linking dataset includes a person ID and link to record in other data first datasets for: \n• Disposals in the magistrates’ court from 1 January 2011 to 31 December 2020\n• Disposals in the Crown Court from 1 January 2013 to 31 December 2020\n• Custodial sentences of offenders in custody from January 2011 to September 2021 (including sentences begun before 2011) \n• Offender probation records from January 2014 to December 2020.", + title='Pathway' + ) + + #note: May need to update/modify: + # https://github.com/HDRUK/schemata-2/blob/master/hdr_schemata/definitions/HDRUK/Followup.py + followup: Optional[Followup] = Field( + None, + description="What is the typical time span that a patient appears in the dataset (follow up period)", + example="CONTINUOUS", + title='Followup' + ) + + #note: not sure if this is the best way of doing it + # ask for age: low, median, high instead? + # allowing [0-150] is not useful ... + typicalAgeRange: Optional[AgeRange] = Field( + None, + description="Age range in whole years of participants in the dataset. 
Please provide range in the following format '[min age] – [max age]' where both the minimum and maximum are whole numbers (integers).", + example="1-150", + title='Typical Age Range' + ) + diff --git a/hdr_schemata/models/GWDM/base/DataColumn.py b/hdr_schemata/models/GWDM/base/DataColumn.py new file mode 100644 index 0000000..7578f15 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/DataColumn.py @@ -0,0 +1,38 @@ +from typing import Optional, List +from pydantic import BaseModel, Field, constr +from hdr_schemata.definitions.HDRUK import * + +from .DataValue import DataValue + +class DataColumn(BaseModel): + class Config: + extra = 'allow' + + name: Name = Field( + ..., + description='The name of a column in a table.', + title='Column Name' + ) + dataType: str = Field( + ..., + description='The data type of values in the column', + title='Data Type' + ) + + description: Optional[constr(min_length=1, max_length=20000)] = Field( + None, + description='A description of a column in a table.', + title='Column Description', + ) + + sensitive: bool = Field( + ..., + description='A True or False value, indicating if the field is sensitive or not', + title='Sensitive', + ) + + values: Optional[List[DataValue]] = Field( + ..., + description='Data values contained within the column', + title='Values', + ) diff --git a/hdr_schemata/models/GWDM/base/DataTable.py b/hdr_schemata/models/GWDM/base/DataTable.py new file mode 100644 index 0000000..0981578 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/DataTable.py @@ -0,0 +1,25 @@ +from typing import Optional, List +from pydantic import BaseModel, Field, constr +from hdr_schemata.definitions.HDRUK import * + +from .DataColumn import DataColumn + +class DataTable(BaseModel): + class Config: + extra = 'forbid' + + name: Optional[constr(min_length=1, max_length=500)] = Field( + ..., + description='The name of a table in a dataset.', + title='Table Name' + ) + description: Optional[constr(min_length=1, max_length=20000)] = Field( + 
None, + description='A description of a table in a dataset.', + title='Table Description', + ) + columns: List[DataColumn] = Field( + ..., + description='A list of columns contained within a table in a dataset.', + title='Data Columns', + ) diff --git a/hdr_schemata/models/GWDM/base/DataValue.py b/hdr_schemata/models/GWDM/base/DataValue.py new file mode 100644 index 0000000..b19f693 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/DataValue.py @@ -0,0 +1,26 @@ +from typing import Optional +from pydantic import BaseModel, Field, constr +from hdr_schemata.definitions.HDRUK import * + +class DataValue(BaseModel): + class Config: + extra = 'allow' + + name: Name = Field( + ..., + description='Unique value in a column .', + title='Value Name' + ) + + description: Optional[constr(min_length=1, max_length=20000)] = Field( + None, + description='A description of a unique value in a column.', + title='Value Description', + ) + + frequency: Optional[int] = Field( + None, + description='The frequency of occurrance of a value in a column', + title='Value Frequency', + ) + diff --git a/hdr_schemata/models/GWDM/base/DatasetLinkage.py b/hdr_schemata/models/GWDM/base/DatasetLinkage.py new file mode 100644 index 0000000..fbc6d1f --- /dev/null +++ b/hdr_schemata/models/GWDM/base/DatasetLinkage.py @@ -0,0 +1,38 @@ +from typing import Optional, Union, List +from pydantic import BaseModel, Field +from hdr_schemata.definitions.HDRUK import * + +class DatasetLinkage(BaseModel): + class Config: + extra = 'forbid' + + isDerivedFrom: Optional[CommaSeparatedValues] = Field( + None, + description="Indicate if derived datasets or predefined extracts are available and the type of derivation available. Notes. 
Single or multiple dimensions can be provided as a derived extract alongside the dataset", + example="Data will be minimised as appropriate relative to the data access application", + title='Derivations', + ) + + #note: this could be greatly improved - link with DataCollections or other Dataset objects? + isPartOf: Optional[CommaSeparatedValues] = Field( + None, + description='If the dataset is part of a group or family', + example="UKCRC Tissue Directory and Coordination Centre", + title='Is PartOf', + ) + + #note: why was this included? Need to ask Damon as it's an 'extra' + isMemberOf: Optional[CommaSeparatedValues] = Field( + None, + description='Dataset is a member of XXX(?)', + title='Is MemberOf', + ) + + #note: current data is nonsensical... + # make better use out of this by linking to urls or gatewayIDs of other datasets? + linkedDatasets: Optional[CommaSeparatedValues] = Field( + None, + description='Links to other datasets.', + example="Yes. To any SAIL dataset & reference data.,ALL", + title='Linked Datasets', + ) diff --git a/hdr_schemata/models/GWDM/base/FormatAndStandards.py b/hdr_schemata/models/GWDM/base/FormatAndStandards.py new file mode 100644 index 0000000..4e67276 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/FormatAndStandards.py @@ -0,0 +1,44 @@ +from typing import Optional, Union, List +from pydantic import BaseModel, Field +from hdr_schemata.definitions.HDRUK import * + +class FormatAndStandards(BaseModel): + class Config: + extra = 'forbid' + + #note: should be limited to allowed values? + # see: https://github.com/HDRUK/schemata-2/blob/master/hdr_schemata/definitions/HDRUK/ControlledVocabularyEnum.py + vocabularyEncodingSchemes: Optional[CommaSeparatedValues] = Field( + ..., + description='Code value of the ontology vocabulary encoding', + example="OPCS4,NHS NATIONAL CODES,ICD10,OTHER", + title='Controlled Vocabulary', + ) + + #note: don't really know what this means, conforms to 'NHS DATA DICTIONARY' what does that mean?!?
+ conformsTo: Optional[CommaSeparatedValues] = Field( + ..., + description='What the vocabulary conforms to.', + example="LOCAL,NHS DATA DICTIONARY", + title='Conforms To', + ) + + #note: may need to limit these (en,fr,es,po,..) instead of arbitrary CommaSeparatedValues + # see: https://github.com/HDRUK/schemata-2/blob/master/hdr_schemata/definitions/HDRUK/LanguageEnum.py + languages: Optional[CommaSeparatedValues] = Field( + ..., + description='Language code(s) of the language of the dataset metadata and underlying data is made available.', + example="en", + title='Language Code(s)', + ) + + #note: this is surely dependent on the AccessService? + # e.g. SAIL will let you access via SQL only + #note: may need to limit these to specific Enum formats instead of CommaSeparatedValues + # 'SQL database table' --> some really poor choices of 'formats' + formats: Optional[CommaSeparatedValues] = Field( + ..., + description='Format(s) the dataset can be made available in', + example="CSV,JSON,SQL database table", + title='Dataset Format', + ) diff --git a/hdr_schemata/models/GWDM/base/Linkage.py b/hdr_schemata/models/GWDM/base/Linkage.py new file mode 100644 index 0000000..aa30c43 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Linkage.py @@ -0,0 +1,63 @@ +from typing import Optional, Union, List +from pydantic import BaseModel, Field +from hdr_schemata.definitions.HDRUK import * + +from .DatasetLinkage import DatasetLinkage + +class Linkage(BaseModel): + class Config: + extra = 'forbid' + + #note: this is a new field + # what are we going to do with it? + isGeneratedUsing: Optional[CommaSeparatedValues] = Field( + None, + description='??', + title='Is Generated Using' + ) + + #note: may need to be comma separated list of URLs? + associatedMedia: Optional[CommaSeparatedValues] = Field( + None, + description='Any media associated with the Gateway Organisation using a valid URI for the content.
This is an opportunity to provide additional context that could be useful for researchers wanting to understand more about the dataset and its relevance to their research question', + example= "https://popdatasci.swan.ac.uk/centres-of-excellence/sail/,https://www.youtube.com/watch?v=ZK9-Jw3uVkw,https://saildatabank.com/,https://saildatabank.com/about-us/", + title='Associated Media' + ) + + #note: new field - what are we going to do with it?? + dataUses: Optional[CommaSeparatedValues] = Field( + None, + description='??', + title='Data Uses' + ) + + #note: dont we have this already somewhere else? Linked DOIs? + isReferenceIn: Optional[CommaSeparatedValues] = Field( + None, + description='Rhe keystone paper associated with the dataset. Also include a list of known citations, if available and s\ +hould be links to existing resources where the dataset has been used or referenced.', + title='Is Reference in' + ) + + #note: limit this is comma separated values of URLs? + tools: Optional[CommaSeparatedValues] = Field( + None, + description='URL of any analysis tools or models that have been created for this dataset and are available for further use', + example="https://conceptlibrary.saildatabank.com/", + title='Tools', + ) + + datasetLinkage: Optional[DatasetLinkage] = Field( + None, + description='Dataset Linkage copied over from', + title='Dataset Linkage', + ) + + #note: something wrong with this description and/or something needs updating with what this is needed for... 
+ investigations: Optional[CommaSeparatedValues] = Field( + None, + description='Please provide the keystone paper associated with the dataset.', + example="https://digital.nhs.uk/services/data-access-request-service-dars/register-of-approved-data-releases", + title='Investigations' + ) + diff --git a/hdr_schemata/models/GWDM/base/Observations.py b/hdr_schemata/models/GWDM/base/Observations.py new file mode 100644 index 0000000..b817c58 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Observations.py @@ -0,0 +1,36 @@ +from datetime import datetime, date +from typing import Optional, Union, List +from pydantic import BaseModel, Field +from hdr_schemata.definitions.HDRUK import * + + +class Observation(BaseModel): + class Config: + extra = 'forbid' + + observedNode: StatisticalPopulationConstrained = Field( + ..., + description='Please select one of the following statistical populations for you observation', + examples=['PERSONS'], + title='Statistical Population', + ) + measuredValue: int = Field( + ..., + description='Please provide the population size associated with the population type the dataset i.e. 1000 people in a study, or 87 images (MRI) of Knee Usage Note: Used with Statistical Population, which specifies the type of the population in the dataset.', + title='Measured Value', + ) + disambiguatingDescription: Optional[AbstractText] = Field( + None, + description='If SNOMED CT term does not provide sufficient detail, please provide a description that disambiguates the population type.', + title='Disambiguating Description', + ) + observationDate: Union[date, datetime] = Field( + ..., + description='Please provide the date that the observation was made. Some datasets may be continuously updated and the number of records will change regularly, so the observation date provides users with the date that the analysis or query was run to generate the particular observation. Multiple observations can be made i.e. 
an observation of cumulative COVID positive cases by specimen on the 1/1/2021 could be 2M. On the 8/1/2021 a new observation could be 2.1M. Users can add multiple observations.', + title='Observation Date', + ) + measuredProperty: MeasuredProperty = Field( + ..., + description='Initially this will be defaulted to "COUNT"', + title='Measured Property', + ) diff --git a/hdr_schemata/models/GWDM/base/Origin.py b/hdr_schemata/models/GWDM/base/Origin.py new file mode 100644 index 0000000..1af74ba --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Origin.py @@ -0,0 +1,33 @@ +from typing import Optional, List, Union +from pydantic import BaseModel, Field +from hdr_schemata.definitions.HDRUK import * + +class Origin(BaseModel): + class Config: + extra = 'forbid' + + #note: shall we update to limit to: https://github.com/HDRUK/schemata-2/blob/master/hdr_schemata/definitions/HDRUK/Purpose.py + purpose: Optional[CommaSeparatedValues] = Field( + None, + description='Indicates the purpose(s) that the dataset was collected.', + example='ADMINISTRATIVE,STATUTORY', + title='Purpose', + ) + + #note: update to limit to: https://github.com/HDRUK/schemata-2/blob/master/hdr_schemata/definitions/HDRUK/Source.py + source: Optional[CommaSeparatedValues] = Field( + None, + description='Indicates the source of the data extraction', + example= "PAPER BASED,ELECTRONIC SURVEY", + title='Source', + ) + + #note: update to limit to: https://github.com/HDRUK/schemata-2/blob/master/hdr_schemata/definitions/HDRUK/Setting.py + collectionSituation: Optional[CommaSeparatedValues] = Field( + None, + description='Indicate the setting(s) where data was collected. 
Multiple settings may be provided', + example="IN-PATIENTS,PRIMARY CARE", + title='Setting', + ) + + diff --git a/hdr_schemata/models/GWDM/base/Provenance.py b/hdr_schemata/models/GWDM/base/Provenance.py new file mode 100644 index 0000000..1866455 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Provenance.py @@ -0,0 +1,13 @@ +from typing import Optional +from pydantic import BaseModel, Field +from hdr_schemata.definitions.HDRUK import * + +from .Origin import Origin +from .Temporal import Temporal + +class Provenance(BaseModel): + class Config: + extra = 'forbid' + + origin: Optional[Origin] = None + temporal: Temporal diff --git a/hdr_schemata/models/GWDM/base/Publisher.py b/hdr_schemata/models/GWDM/base/Publisher.py new file mode 100644 index 0000000..807d5df --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Publisher.py @@ -0,0 +1,21 @@ +from typing import Optional +from pydantic import AnyUrl, BaseModel, EmailStr, Field, constr +from hdr_schemata.definitions.HDRUK import * + + +class Publisher(BaseModel): + + publisherName: Optional[Name] = Field( + ..., + description="The organisation responsible for running or supporting the data access request process, as well as publishing and maintaining the metadata. In most this will be the same as the HDR UK Organisation (Hub or Alliance Member)/", + example="SAIL", + title='Publisher name' + ) + + #note: will need to do something about this in the future + # should match a pattern? sha256? integer? 
+ publisherGatewayId: Optional[constr(min_length=2,max_length=50)] = Field( + None, + description="The link to an ID somewhere in the gateway where more information on the publisher can be retrieved.", + title='Publisher gateway id' + ) diff --git a/hdr_schemata/models/GWDM/base/Required.py b/hdr_schemata/models/GWDM/base/Required.py new file mode 100644 index 0000000..fae4235 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Required.py @@ -0,0 +1,38 @@ +from datetime import date, datetime +from enum import Enum +from typing import List, Optional, Union + +from pydantic import AnyUrl, BaseModel, EmailStr, Field, constr + +from hdr_schemata.definitions.HDRUK import * + +from .Revision import Revision + +class Required(BaseModel): + gatewayId: constr(min_length=2,max_length=50) = Field( + ..., + description='Need a field in Mauro that captures the datasetID to link to gateway database - or can we just use the one created in Mauro?', + title='Gatewayid', + ) + gatewayPid: constr(min_length=2,max_length=50) = Field( + ..., + description='Need a field in Mauro that captures the dataset pid to link to gateway database', + title='Gatewaypid', + ) + issued: datetime = Field( + ..., + description="Aren't issued and modified always the same because of versioning? Is that fine to duplicate because datasets in dcat might look different?", + title='Issued', + ) + modified: datetime = Field( + ..., + description="Aren't issued and modified always the same because of versioning? Is that fine to duplicate because datasets in dcat might look different?", + title='Modified', + ) + + #note: do we also need to include a 'latest'? 
+ revisions: List[Revision] = Field( + ..., + title='Revisions') + + diff --git a/hdr_schemata/models/GWDM/base/Revision.py b/hdr_schemata/models/GWDM/base/Revision.py new file mode 100644 index 0000000..2952a78 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Revision.py @@ -0,0 +1,19 @@ +from pydantic import AnyUrl, BaseModel, EmailStr, Field, constr +from hdr_schemata.definitions.HDRUK import * + + +class Revision(BaseModel): + + version: constr(min_length=2,max_length=100) = Field( + ..., + description='Version number used for previous version of this dataset', + example='6.0.0', + title='revision version' + ) + + url: Url = Field( + ..., + description='Some url with a reference to the record of a previous version of this dataset', + example='https://api.service.nhs.uk/health-research-data-catalogue/datasetrevisions/841f7da2-b018-41f6-b4ae-2e0aadab6561', + title='revision url' + ) diff --git a/hdr_schemata/models/GWDM/base/Summary.py b/hdr_schemata/models/GWDM/base/Summary.py new file mode 100644 index 0000000..047de76 --- /dev/null +++ b/hdr_schemata/models/GWDM/base/Summary.py @@ -0,0 +1,82 @@ +from datetime import date, datetime +from enum import Enum +from typing import List, Optional, Union + +from pydantic import AnyUrl, BaseModel, EmailStr, Field, constr + +from hdr_schemata.definitions.HDRUK import * + +from .Publisher import Publisher + +class Summary(BaseModel): + + title: TwoHundredFiftyFiveCharacters = Field( + ..., + description='The main title of the dataset', + example="Publications that mention HDR-UK (or any variant thereof) in Acknowledgements or Author Affiliations", + title='Title' + ) + + shortTitle: Optional[ShortTitle] = Field( + ..., + description='A shorter descriptive title of the dataset', + example="ONS 2011 Census Wales (CENW)", + title='Shorttitle' + ) + + doiName: Optional[Doi] = Field( + ..., + description="DOI associated to this dataset", + example="10.1093/ije/dyx196", + title='Doiname' + ) + + abstract: LongAbstractText = 
Field( + ..., + description="Longer abstract detailing the dataset.", + example="COVID-19 Key Worker Testing Results data is required by NHS Digital to support COVID-19 requests for linkage, analysis and dissemination to other organisations who require the data in a timely manner.", + title='Abstract' + ) + + keywords: Optional[CommaSeparatedValues] = Field( + ..., + description="Comma separated key words associated to this dataset.", + example="Preprints,Papers,HDR UK", + title='Keywords' + ) + + #note: do we want to limit these values by Enums? + controlledKeywords: Optional[CommaSeparatedValues] = Field( + ..., + description="Keywords that have been filtered and limited", + title='Controlled Keywords' + ) + + contactPoint: Optional[EmailStr] = Field( + ..., + description='email of a person who can be the main contact point of this dataset', + example="susheel.varma@hdruk.ac.uk", + title='Contact Point' + ) + + #note: new addition added by Damon.. may need to revisit what this should be? + # should be Enums? 
from datetime import date,datetime
from typing import Optional, List, Union
from pydantic import BaseModel, ConfigDict, Field
from hdr_schemata.definitions.HDRUK import *


# Temporal coverage and release cadence of a dataset.
# Documented with comments rather than a class docstring so the JSON schema
# generated from this model does not gain a description.
class Temporal(BaseModel):
    # pydantic v2 style configuration: unknown keys are rejected.
    # This replaces the nested `class Config`, which pydantic v2 only keeps
    # for backwards compatibility and reports as deprecated.
    model_config = ConfigDict(extra='forbid')

    # The key must be present (`...`), but the Optional type lets it be null.
    startDate: Optional[Union[date, datetime]] = Field(
        ...,
        description='The start of the time period that the dataset provides coverage for',
        example='12/03/2020',
        title='Start Date',
    )

    # May be omitted entirely; defaults to None.
    endDate: Optional[Union[date, datetime]] = Field(
        None,
        description='The end of the time period that the dataset provides coverage for',
        example='12/03/2020',
        title='End Date',
    )

    # NOTE(review): 'Rypical' looks like a typo for 'Typical'. It is left
    # untouched here because the description is emitted into the generated
    # JSON schema; fix it together with the stored schema.json.
    timeLag: TimeLag = Field(
        ...,
        description='Rypical time-lag between an event and the data for that event appearing in the dataset',
        example="LESS 1 WEEK",
        title='Time Lag',
    )

    accrualPeriodicity: Periodicity = Field(
        ...,
        description='frequency of distribution release. If a dataset is distributed regularly please choose a distribution release periodicity from the constrained list and indicate the next release date. When the release date becomes historical, a new release date will be calculated based on the publishing periodicity.',
        example="MONTHLY",
        title='Periodicity',
    )

    # May be omitted entirely; defaults to None.
    distributionReleaseDate: Optional[Union[date, datetime]] = Field(
        None,
        description='Date of the latest release of the dataset. If this is a regular release i.e. quarterly, or this is a static dataset please complete this alongside Periodicity.',
        title='Release Date',
    )
from datetime import date, datetime
from enum import Enum
from typing import List, Optional, Union

from pydantic import AnyUrl, BaseModel, ConfigDict, EmailStr, Field, constr

from hdr_schemata.definitions.HDRUK import *

from .Required import Required
from .Summary import Summary
from .Coverage import Coverage
from .Provenance import Provenance
from .Accessibility import Accessibility
from .Linkage import Linkage
from .Observations import Observation
from .DataTable import DataTable


# Top-level GWDM document: composes the section models imported above.
# Documented with comments rather than a class docstring so the JSON schema
# generated from this model does not gain a description.
class GwdmBaseModel(BaseModel):
    # pydantic v2 style configuration: unknown top-level keys are rejected.
    # This replaces the nested `class Config`, which pydantic v2 only keeps
    # for backwards compatibility and reports as deprecated.
    model_config = ConfigDict(extra='forbid')

    required: Required = Field(
        ...,
        description='required metadata needed for the GWDM',
        title='Required'
    )

    summary: Summary = Field(
        ...,
        description='Summary of metadata describing key pieces of information.',
        title='Summary',
    )

    # The original call passed no default at all, which pydantic v2 treats as
    # "required"; `...` states the same thing explicitly, like the other
    # required fields of this model.
    # NOTE(review): the sibling Optional sections (provenance, linkage,
    # structuralMetadata) default to None - confirm whether coverage was
    # meant to be omittable too before changing this.
    coverage: Optional[Coverage] = Field(
        ...,
        description='Spatial and Temporal coverage',
        title='Coverage',
    )

    provenance: Optional[Provenance] = Field(
        None,
        description='Provenance information',
        title='Provenance',
    )

    accessibility: Accessibility = Field(
        ...,
        description='Accessibility information.',
        title='Accessibility',
    )

    linkage: Optional[Linkage] = Field(
        None,
        description='Linkage and enrichment.',
        title='Linkage',
    )

    # NOTE(review): 'Obsservations' looks like a typo. It is left untouched
    # because the description is emitted into the generated JSON schema; fix
    # it together with the stored schema.json.
    observations: List[Observation] = Field(
        ...,
        description='Obsservations',
        title='Observations',
    )

    structuralMetadata: Optional[List[DataTable]] = Field(
        None,
        description='Descriptions of all tables and data elements that can be included in the dataset',
        title='Structural Metadata',
    )
"""Validation and JSON-schema regression tests for the HDRUK and GWDM models."""
from pydantic import ValidationError
import json
from pathlib import Path

from hdr_schemata.models.HDRUK import Hdruk212
from hdr_schemata.models.GWDM import Gwdm10

# Root of the hdr_schemata package (this file lives in hdr_schemata/tests/).
# Resolving fixtures from here instead of the current working directory lets
# pytest be started from anywhere, not only from inside the tests folder.
_PACKAGE_ROOT = Path(__file__).resolve().parent.parent


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def get_metadata(model, version):
    """Return the example metadata document for *model* at *version*.

    Reads ``examples/<model>/<version>/example.json`` inside the package.
    """
    return _load_json(_PACKAGE_ROOT / 'examples' / model / version / 'example.json')


def get_schema(model, version):
    """Return the stored JSON schema for *model* at *version*.

    Reads ``models/<model>/<version>/schema.json`` inside the package.
    """
    return _load_json(_PACKAGE_ROOT / 'models' / model / version / 'schema.json')


# Top-level keys, in order, that both generated schemas are expected to have.
_EXPECTED_SCHEMA_KEYS = ['$defs', 'additionalProperties', 'properties', 'required', 'title', 'type']


class TestHdruk212:
    """Checks for the HDRUK 2.1.2 model."""

    metadata = get_metadata('HDRUK','2.1.2')
    json_schema = get_schema('HDRUK','2.1.2')

    def test_validation(self):
        # Constructing the model raises if the example does not validate.
        assert Hdruk212(**self.metadata) is not None

    def test_json_schema(self):
        schema = Hdruk212.model_json_schema()
        assert list(schema.keys()) == _EXPECTED_SCHEMA_KEYS
        # The generated schema must match the one stored in the repository.
        assert schema == self.json_schema


class TestGwdm10:
    """Checks for the GWDM 1.0 model."""

    metadata = get_metadata('GWDM','1.0')
    json_schema = get_schema('GWDM','1.0')

    def test_validation(self):
        # Constructing the model raises if the example does not validate.
        assert Gwdm10(**self.metadata) is not None

    # This method used to be defined twice in the class body. The first copy
    # (without the stored-schema comparison) was dead code, because the second
    # definition replaced it; only the effective one is kept.
    def test_json_schema(self):
        schema = Gwdm10.model_json_schema()
        assert list(schema.keys()) == _EXPECTED_SCHEMA_KEYS
        # The generated schema must match the one stored in the repository.
        assert schema == self.json_schema