diff --git a/content/json_schema/AnVILBioCoreMinimal.schema.json b/content/json_schema/AnVILBioCoreMinimal.schema.json index 7d93443..ffe81e2 100644 --- a/content/json_schema/AnVILBioCoreMinimal.schema.json +++ b/content/json_schema/AnVILBioCoreMinimal.schema.json @@ -9,7 +9,7 @@ "type": "string" }, "apriori_cell_type": { - "description": "A priori cell type(s) for the sample, a human assignment of cell type.", + "description": "A priori cell type(s) for the sample, a human assignment of cell type. This should be used when the cell type is known, but not necessarily confirmed by primary experimental data.", "items": { "type": "string" }, @@ -19,7 +19,7 @@ "type": "string" }, "biosample_type": { - "description": "The type of biosample represented by the record.", + "description": "The type of biosample represented by the record. This is a controlled vocabulary term from BioCoreTerms.", "type": "string" }, "disease": { @@ -57,12 +57,16 @@ "type": "string" }, "genetic_ancestry": { - "description": "A property that relects a HumanDonor's reported major contributing ancestral origins based on genetic/genomic data.", + "description": "A property that reflects a HumanDonor's reported major contributing ancestral origins based on genetic/genomic data. If you are not sure your values are calculated from the genomic data, use *reported_ethnicity* instead.", "items": { "type": "string" }, "type": "array" }, + "human_phenotype": { + "$ref": "#/$defs/HumanPhenotypeOntology", + "description": "phenotype code from the human phenotype ontology (HPO)" + }, "organism_type": { "description": "A reference to the organism type.", "type": "string" @@ -72,7 +76,7 @@ "type": "string" }, "reported_ethnicity": { - "description": "A property that relects a Human Donor's reported ethnic origins.", + "description": "A property that reflects a Human Donor's reported ethnic origins. 
Note this may contain both Race and Ethnicity information as defined by the US Department of the Interior (DOI) https://www.doi.gov/pmb/eeo/directives/race-data", "items": { "type": "string" }, @@ -132,6 +136,16 @@ ], "title": "AnvilFile", "type": "object" + }, + "HumanPhenotypeOntology": { + "description": "", + "enum": [ + "ALIVE", + "DEAD", + "UNKNOWN" + ], + "title": "HumanPhenotypeOntology", + "type": "string" } }, "$id": "https://github.com/DataBiosphere/biocore-data-model/tree/main/content", diff --git a/content/linkml/AnVILBioCoreMinimal.linkml.yaml b/content/linkml/AnVILBioCoreMinimal.linkml.yaml index 2ebbb6c..44a5c2e 100644 --- a/content/linkml/AnVILBioCoreMinimal.linkml.yaml +++ b/content/linkml/AnVILBioCoreMinimal.linkml.yaml @@ -4,6 +4,7 @@ id: https://github.com/DataBiosphere/biocore-data-model/tree/main/content prefixes: linkml: https://w3id.org/linkml/ anvil: https://anvilproject.org/ + hpo: https://hpo.jax.org/app/browse/term/ default_prefix: anvil imports: - linkml:types # this imports the linkml types schema @@ -33,6 +34,7 @@ classes: - phenotypic_sex - reported_ethnicity - genetic_ancestry + - human_phenotype AnvilFile: description: Information for files associated with the study. slots: @@ -137,4 +139,13 @@ slots: reference_assembly: aliases: usesReferenceAssembly description: A reference to the collection of sequences taken as the standard for a given organism. May be defined by https://www.ncbi.nlm.nih.gov/grc. 
- multivalued: true \ No newline at end of file + multivalued: true + + human_phenotype: + description: phenotype code from the human phenotype ontology (HPO) + range: HumanPhenotypeOntology + +enums: + HumanPhenotypeOntology: + permissible_values: + All: diff --git a/content/tools/python/.ipynb_checkpoints/convert_to_tdr_schema-checkpoint.ipynb b/content/tools/python/.ipynb_checkpoints/convert_to_tdr_schema-checkpoint.ipynb new file mode 100644 index 0000000..b4ce2ed --- /dev/null +++ b/content/tools/python/.ipynb_checkpoints/convert_to_tdr_schema-checkpoint.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f87a430c", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'group'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 76\u001b[0m\n\u001b[1;32m 74\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput/AnVILBioCoreMinimal.tdr.json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# Execution\u001b[39;00m\n\u001b[0;32m---> 76\u001b[0m \u001b[43mconvert_schema\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[1], line 29\u001b[0m, in \u001b[0;36mconvert_schema\u001b[0;34m(input_file, output_file)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column, column_def \u001b[38;5;129;01min\u001b[39;00m properties\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Record foreign keys\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
import json
import re


def _is_table_definition(definition):
    """Return True when a ``$defs`` entry describes a table.

    Entries that carry an ``enum`` or whose ``type`` is not ``"object"``
    (for example a string enumeration) are value domains, not tables.
    A missing ``type`` is treated as ``"object"``.
    """
    return definition.get("type", "object") == "object" and "enum" not in definition


def convert_schema(input_file, output_file):
    """Convert a JSON Schema file into a table/relationship schema file.

    Every table-like entry under ``$defs`` becomes a table object
    ``{"name", "columns", "primaryKey"}``; every property becomes a column
    object ``{"name", "datatype", "array_of", "required"}``. A property whose
    ``$ref`` points at another table is recorded as a relationship, and a
    trailing ``_fk`` on its name (if present) is dropped. A property whose
    ``$ref`` points at a non-table definition (e.g. an enum) is emitted as a
    plain column typed after that definition.

    Args:
        input_file: Path of the JSON Schema document to read.
        output_file: Path the converted JSON document is written to.

    Raises:
        ValueError: If a ``$ref`` does not resolve to an entry in ``$defs``,
            or a relationship targets a table that has no ``required`` field
            to act as its key.
    """
    with open(input_file, encoding="utf-8") as infile:
        input_schema = json.load(infile)

    definitions = input_schema.get("$defs") or {}
    # Decide up front which definitions are tables so that a $ref can be
    # classified even when its target appears later in the document.
    tables = {
        name: definition
        for name, definition in definitions.items()
        if _is_table_definition(definition)
    }

    table_list = []
    primary_keys = {}
    foreign_keys = []
    for table, table_def in tables.items():
        # "required" is optional in JSON Schema; treat a missing list as empty.
        required_fields = table_def.get("required") or []
        renamed = {}
        column_list = []
        for column, column_def in (table_def.get("properties") or {}).items():
            final_column_name = column
            type_source = column_def
            ref = column_def.get("$ref")
            if ref:
                match = re.search(r"\$defs/(.*)$", ref)
                target = match.group(1) if match else None
                if target not in definitions:
                    raise ValueError(
                        "Column %s.%s references unknown definition %r"
                        % (table, column, ref)
                    )
                if target in tables:
                    # Foreign key: the "_fk" suffix is a naming convention,
                    # not a requirement, so only strip it when present.
                    if column.endswith("_fk"):
                        final_column_name = column[: -len("_fk")]
                        renamed[column] = final_column_name
                    foreign_keys.append(
                        {
                            "from_table": table,
                            "from_column": final_column_name,
                            "to_table": target,
                        }
                    )
                else:
                    # Reference to an enum/scalar definition: take the
                    # datatype from the referenced definition instead.
                    type_source = definitions[target]

            array_of = False
            initial_data_type = type_source.get("type")
            if initial_data_type == "array":
                final_data_type = (type_source.get("items") or {}).get("type") or "string"
                array_of = True
            elif initial_data_type:
                final_data_type = initial_data_type
            else:
                final_data_type = "string"
            column_list.append(
                {
                    "name": final_column_name,
                    "datatype": final_data_type,
                    "array_of": array_of,
                    "required": column in required_fields,
                }
            )

        # Keep key names in step with any "_fk" renaming applied above.
        primary_key = [renamed.get(field, field) for field in required_fields]
        primary_keys[table] = primary_key[0] if primary_key else None
        table_list.append(
            {"name": table, "columns": column_list, "primaryKey": primary_key}
        )

    # Relationships are built after all tables so every target key is known.
    relationship_list = []
    for fk_entry in foreign_keys:
        from_table = fk_entry["from_table"]
        from_column = fk_entry["from_column"]
        to_table = fk_entry["to_table"]
        to_column = primary_keys.get(to_table)
        if to_column is None:
            raise ValueError(
                "Cannot relate %s.%s to %s: target table has no required field to use as key"
                % (from_table, from_column, to_table)
            )
        rel_name = from_table + "." + from_column + "_to_" + to_table + "." + to_column
        relationship_list.append(
            {
                "name": rel_name,
                "from": {"table": from_table, "column": from_column},
                "to": {"table": to_table, "column": to_column},
            }
        )

    # Empty sections are omitted from the output document.
    output_schema = {}
    if table_list:
        output_schema["tables"] = table_list
    if relationship_list:
        output_schema["relationships"] = relationship_list

    with open(output_file, "w", encoding="utf-8") as outfile:
        json.dump(output_schema, outfile)
This should be used when the cell type is known, but not necessarily confirmed by primary experimental data.',\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'},\n", + " 'biosample_id': {'type': 'string'},\n", + " 'biosample_type': {'description': 'The type of biosample represented by the record. This is a controlled vocabulary term from BioCoreTerms.',\n", + " 'type': 'string'},\n", + " 'disease': {'description': 'A property that identifies a disease or condition has been reported in this entity.',\n", + " 'type': 'string'},\n", + " 'donor_age_at_collection_lower_bound': {'description': 'Lower bound for age of donor at time sample was taken. If any age at collection data is present, must specify a unit as well.',\n", + " 'type': 'number'},\n", + " 'donor_age_at_collection_unit': {'description': 'The units (e.g. years or days) of the Age of the Donor at the point in time that the BioSample was obtained or other representative entity (test, diagnosis, treatment...) was created.',\n", + " 'type': 'string'},\n", + " 'donor_age_at_collection_upper_bound': {'description': 'Upper bound for age of donor at time sample was taken. If any age at collection data is present, must specify a unit as well.',\n", + " 'type': 'number'},\n", + " 'donor_id_fk': {'$ref': '#/$defs/AnvilDonor',\n", + " 'description': 'This property references the Donor organism from which the BioSample was acquired.'}},\n", + " 'required': ['biosample_id'],\n", + " 'title': 'AnvilBioSample',\n", + " 'type': 'object'},\n", + " 'AnvilDonor': {'additionalProperties': False,\n", + " 'description': 'Demographic and phenotypic information about the donor.',\n", + " 'properties': {'donor_id': {'type': 'string'},\n", + " 'genetic_ancestry': {'description': \"A property that relects a HumanDonor's reported major contributing ancestral origins based on genetic/genomic data. 
If you are not sure your values are calculated from the genomic data, use *reported_ethnicity* instead.\",\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'},\n", + " 'human_phenotype': {'$ref': '#/$defs/HumanPhenotypeOntology',\n", + " 'description': 'phenotype code from the human phenotype ontology (HPO)'},\n", + " 'organism_type': {'description': 'A reference to the organism type.',\n", + " 'type': 'string'},\n", + " 'phenotypic_sex': {'description': 'A reference to the BiologicalSex of the Donor organism. \\\\\"An organismal quality inhering in a bearer by virtue of the bearer\\'s physical expression of sexual characteristics. [PATO_0001894]\\\\',\n", + " 'type': 'string'},\n", + " 'reported_ethnicity': {'description': \"A property that relects a Human Donor's reported ethnic origins. Note this may contain both Race and Ethnicity information as define by the US Department of Interior (DOI) https://www.doi.gov/pmb/eeo/directives/race-data\",\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'}},\n", + " 'required': ['donor_id'],\n", + " 'title': 'AnvilDonor',\n", + " 'type': 'object'},\n", + " 'AnvilFile': {'additionalProperties': False,\n", + " 'description': 'Information for files associated with the study.',\n", + " 'properties': {'data_modality': {'description': 'Data modality describes the biological nature of the information gathered as the result of an Activity, independent of the technology or methods used to produce the information.',\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'},\n", + " 'file_format': {'description': 'An indication of the format of an electronic file; include the full file extension including compression extensions.',\n", + " 'type': 'string'},\n", + " 'file_id': {'type': 'string'},\n", + " 'file_md5sum': {'description': 'md5 checksum for the file',\n", + " 'type': 'string'},\n", + " 'file_name': {'description': 'The name of the file.', 'type': 'string'},\n", + " 'file_ref': {'description': 'The 
fully qualified path to the file.',\n", + " 'type': 'string'},\n", + " 'file_size': {'description': 'Property that describes the approximate size of a file in megabytes.',\n", + " 'type': 'integer'},\n", + " 'reference_assembly': {'description': 'A reference to the collection of sequences taken as the standard for a given organism. May be defined by https://www.ncbi.nlm.nih.gov/grc.',\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'}},\n", + " 'required': ['file_id'],\n", + " 'title': 'AnvilFile',\n", + " 'type': 'object'},\n", + " 'HumanPhenotypeOntology': {'description': '',\n", + " 'enum': ['ALIVE', 'DEAD', 'UNKNOWN'],\n", + " 'title': 'HumanPhenotypeOntology',\n", + " 'type': 'string'}},\n", + " '$id': 'https://github.com/DataBiosphere/biocore-data-model/tree/main/content',\n", + " '$schema': 'https://json-schema.org/draft/2019-09/schema',\n", + " 'additionalProperties': True,\n", + " 'metamodel_version': '1.7.0',\n", + " 'title': 'AnVILBioCoreMinimal',\n", + " 'type': 'object',\n", + " 'version': None}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41dd7586-3d45-4edb-8843-1098c49a2ba8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/content/tools/python/convert_to_tdr_schema.ipynb b/content/tools/python/convert_to_tdr_schema.ipynb new file mode 100644 index 0000000..b4ce2ed --- /dev/null +++ b/content/tools/python/convert_to_tdr_schema.ipynb @@ -0,0 +1,236 
@@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f87a430c", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'group'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 76\u001b[0m\n\u001b[1;32m 74\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput/AnVILBioCoreMinimal.tdr.json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# Execution\u001b[39;00m\n\u001b[0;32m---> 76\u001b[0m \u001b[43mconvert_schema\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[1], line 29\u001b[0m, in \u001b[0;36mconvert_schema\u001b[0;34m(input_file, output_file)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column, column_def \u001b[38;5;129;01min\u001b[39;00m properties\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Record foreign keys\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column_def\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m$ref\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 29\u001b[0m final_column_name \u001b[38;5;241m=\u001b[39m \u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m(^.*)_fk$\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumn\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroup\u001b[49m(\u001b[38;5;241m1\u001b[39m) 
import json
import re


def _is_table_definition(definition):
    """Return True when a ``$defs`` entry describes a table.

    Entries that carry an ``enum`` or whose ``type`` is not ``"object"``
    (for example a string enumeration) are value domains, not tables.
    A missing ``type`` is treated as ``"object"``.
    """
    return definition.get("type", "object") == "object" and "enum" not in definition


def convert_schema(input_file, output_file):
    """Convert a JSON Schema file into a table/relationship schema file.

    Every table-like entry under ``$defs`` becomes a table object
    ``{"name", "columns", "primaryKey"}``; every property becomes a column
    object ``{"name", "datatype", "array_of", "required"}``. A property whose
    ``$ref`` points at another table is recorded as a relationship, and a
    trailing ``_fk`` on its name (if present) is dropped. A property whose
    ``$ref`` points at a non-table definition (e.g. an enum) is emitted as a
    plain column typed after that definition.

    Args:
        input_file: Path of the JSON Schema document to read.
        output_file: Path the converted JSON document is written to.

    Raises:
        ValueError: If a ``$ref`` does not resolve to an entry in ``$defs``,
            or a relationship targets a table that has no ``required`` field
            to act as its key.
    """
    with open(input_file, encoding="utf-8") as infile:
        input_schema = json.load(infile)

    definitions = input_schema.get("$defs") or {}
    # Decide up front which definitions are tables so that a $ref can be
    # classified even when its target appears later in the document.
    tables = {
        name: definition
        for name, definition in definitions.items()
        if _is_table_definition(definition)
    }

    table_list = []
    primary_keys = {}
    foreign_keys = []
    for table, table_def in tables.items():
        # "required" is optional in JSON Schema; treat a missing list as empty.
        required_fields = table_def.get("required") or []
        renamed = {}
        column_list = []
        for column, column_def in (table_def.get("properties") or {}).items():
            final_column_name = column
            type_source = column_def
            ref = column_def.get("$ref")
            if ref:
                match = re.search(r"\$defs/(.*)$", ref)
                target = match.group(1) if match else None
                if target not in definitions:
                    raise ValueError(
                        "Column %s.%s references unknown definition %r"
                        % (table, column, ref)
                    )
                if target in tables:
                    # Foreign key: the "_fk" suffix is a naming convention,
                    # not a requirement, so only strip it when present.
                    if column.endswith("_fk"):
                        final_column_name = column[: -len("_fk")]
                        renamed[column] = final_column_name
                    foreign_keys.append(
                        {
                            "from_table": table,
                            "from_column": final_column_name,
                            "to_table": target,
                        }
                    )
                else:
                    # Reference to an enum/scalar definition: take the
                    # datatype from the referenced definition instead.
                    type_source = definitions[target]

            array_of = False
            initial_data_type = type_source.get("type")
            if initial_data_type == "array":
                final_data_type = (type_source.get("items") or {}).get("type") or "string"
                array_of = True
            elif initial_data_type:
                final_data_type = initial_data_type
            else:
                final_data_type = "string"
            column_list.append(
                {
                    "name": final_column_name,
                    "datatype": final_data_type,
                    "array_of": array_of,
                    "required": column in required_fields,
                }
            )

        # Keep key names in step with any "_fk" renaming applied above.
        primary_key = [renamed.get(field, field) for field in required_fields]
        primary_keys[table] = primary_key[0] if primary_key else None
        table_list.append(
            {"name": table, "columns": column_list, "primaryKey": primary_key}
        )

    # Relationships are built after all tables so every target key is known.
    relationship_list = []
    for fk_entry in foreign_keys:
        from_table = fk_entry["from_table"]
        from_column = fk_entry["from_column"]
        to_table = fk_entry["to_table"]
        to_column = primary_keys.get(to_table)
        if to_column is None:
            raise ValueError(
                "Cannot relate %s.%s to %s: target table has no required field to use as key"
                % (from_table, from_column, to_table)
            )
        rel_name = from_table + "." + from_column + "_to_" + to_table + "." + to_column
        relationship_list.append(
            {
                "name": rel_name,
                "from": {"table": from_table, "column": from_column},
                "to": {"table": to_table, "column": to_column},
            }
        )

    # Empty sections are omitted from the output document.
    output_schema = {}
    if table_list:
        output_schema["tables"] = table_list
    if relationship_list:
        output_schema["relationships"] = relationship_list

    with open(output_file, "w", encoding="utf-8") as outfile:
        json.dump(output_schema, outfile)
If any age at collection data is present, must specify a unit as well.',\n", + " 'type': 'number'},\n", + " 'donor_age_at_collection_unit': {'description': 'The units (e.g. years or days) of the Age of the Donor at the point in time that the BioSample was obtained or other representative entity (test, diagnosis, treatment...) was created.',\n", + " 'type': 'string'},\n", + " 'donor_age_at_collection_upper_bound': {'description': 'Upper bound for age of donor at time sample was taken. If any age at collection data is present, must specify a unit as well.',\n", + " 'type': 'number'},\n", + " 'donor_id_fk': {'$ref': '#/$defs/AnvilDonor',\n", + " 'description': 'This property references the Donor organism from which the BioSample was acquired.'}},\n", + " 'required': ['biosample_id'],\n", + " 'title': 'AnvilBioSample',\n", + " 'type': 'object'},\n", + " 'AnvilDonor': {'additionalProperties': False,\n", + " 'description': 'Demographic and phenotypic information about the donor.',\n", + " 'properties': {'donor_id': {'type': 'string'},\n", + " 'genetic_ancestry': {'description': \"A property that relects a HumanDonor's reported major contributing ancestral origins based on genetic/genomic data. If you are not sure your values are calculated from the genomic data, use *reported_ethnicity* instead.\",\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'},\n", + " 'human_phenotype': {'$ref': '#/$defs/HumanPhenotypeOntology',\n", + " 'description': 'phenotype code from the human phenotype ontology (HPO)'},\n", + " 'organism_type': {'description': 'A reference to the organism type.',\n", + " 'type': 'string'},\n", + " 'phenotypic_sex': {'description': 'A reference to the BiologicalSex of the Donor organism. \\\\\"An organismal quality inhering in a bearer by virtue of the bearer\\'s physical expression of sexual characteristics. 
[PATO_0001894]\\\\',\n", + " 'type': 'string'},\n", + " 'reported_ethnicity': {'description': \"A property that relects a Human Donor's reported ethnic origins. Note this may contain both Race and Ethnicity information as define by the US Department of Interior (DOI) https://www.doi.gov/pmb/eeo/directives/race-data\",\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'}},\n", + " 'required': ['donor_id'],\n", + " 'title': 'AnvilDonor',\n", + " 'type': 'object'},\n", + " 'AnvilFile': {'additionalProperties': False,\n", + " 'description': 'Information for files associated with the study.',\n", + " 'properties': {'data_modality': {'description': 'Data modality describes the biological nature of the information gathered as the result of an Activity, independent of the technology or methods used to produce the information.',\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'},\n", + " 'file_format': {'description': 'An indication of the format of an electronic file; include the full file extension including compression extensions.',\n", + " 'type': 'string'},\n", + " 'file_id': {'type': 'string'},\n", + " 'file_md5sum': {'description': 'md5 checksum for the file',\n", + " 'type': 'string'},\n", + " 'file_name': {'description': 'The name of the file.', 'type': 'string'},\n", + " 'file_ref': {'description': 'The fully qualified path to the file.',\n", + " 'type': 'string'},\n", + " 'file_size': {'description': 'Property that describes the approximate size of a file in megabytes.',\n", + " 'type': 'integer'},\n", + " 'reference_assembly': {'description': 'A reference to the collection of sequences taken as the standard for a given organism. 
May be defined by https://www.ncbi.nlm.nih.gov/grc.',\n", + " 'items': {'type': 'string'},\n", + " 'type': 'array'}},\n", + " 'required': ['file_id'],\n", + " 'title': 'AnvilFile',\n", + " 'type': 'object'},\n", + " 'HumanPhenotypeOntology': {'description': '',\n", + " 'enum': ['ALIVE', 'DEAD', 'UNKNOWN'],\n", + " 'title': 'HumanPhenotypeOntology',\n", + " 'type': 'string'}},\n", + " '$id': 'https://github.com/DataBiosphere/biocore-data-model/tree/main/content',\n", + " '$schema': 'https://json-schema.org/draft/2019-09/schema',\n", + " 'additionalProperties': True,\n", + " 'metamodel_version': '1.7.0',\n", + " 'title': 'AnVILBioCoreMinimal',\n", + " 'type': 'object',\n", + " 'version': None}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41dd7586-3d45-4edb-8843-1098c49a2ba8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}