Skip to content

Commit

Permalink
update minimal anvil and tdr conversion script
Browse files Browse the repository at this point in the history
  • Loading branch information
rsc3 committed Apr 16, 2024
1 parent a23bbfa commit e62eb14
Show file tree
Hide file tree
Showing 4 changed files with 502 additions and 5 deletions.
22 changes: 18 additions & 4 deletions content/json_schema/AnVILBioCoreMinimal.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"type": "string"
},
"apriori_cell_type": {
"description": "A priori cell type(s) for the sample, a human assignment of cell type.",
"description": "A priori cell type(s) for the sample, a human assignment of cell type. This should be used when the cell type is known, but not necessarily confirmed by primary experimental data.",
"items": {
"type": "string"
},
Expand All @@ -19,7 +19,7 @@
"type": "string"
},
"biosample_type": {
"description": "The type of biosample represented by the record.",
"description": "The type of biosample represented by the record. This is a controlled vocabulary term from BioCoreTerms.",
"type": "string"
},
"disease": {
Expand Down Expand Up @@ -57,12 +57,16 @@
"type": "string"
},
"genetic_ancestry": {
"description": "A property that relects a HumanDonor's reported major contributing ancestral origins based on genetic/genomic data.",
"description": "A property that relects a HumanDonor's reported major contributing ancestral origins based on genetic/genomic data. If you are not sure your values are calculated from the genomic data, use *reported_ethnicity* instead.",
"items": {
"type": "string"
},
"type": "array"
},
"human_phenotype": {
"$ref": "#/$defs/HumanPhenotypeOntology",
"description": "phenotype code from the human phenotype ontology (HPO)"
},
"organism_type": {
"description": "A reference to the organism type.",
"type": "string"
Expand All @@ -72,7 +76,7 @@
"type": "string"
},
"reported_ethnicity": {
"description": "A property that relects a Human Donor's reported ethnic origins.",
"description": "A property that relects a Human Donor's reported ethnic origins. Note this may contain both Race and Ethnicity information as define by the US Department of Interior (DOI) https://www.doi.gov/pmb/eeo/directives/race-data",
"items": {
"type": "string"
},
Expand Down Expand Up @@ -132,6 +136,16 @@
],
"title": "AnvilFile",
"type": "object"
},
"HumanPhenotypeOntology": {
"description": "",
"enum": [
"ALIVE",
"DEAD",
"UNKNOWN"
],
"title": "HumanPhenotypeOntology",
"type": "string"
}
},
"$id": "https://github.com/DataBiosphere/biocore-data-model/tree/main/content",
Expand Down
13 changes: 12 additions & 1 deletion content/linkml/AnVILBioCoreMinimal.linkml.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ id: https://github.com/DataBiosphere/biocore-data-model/tree/main/content
prefixes:
linkml: https://w3id.org/linkml/
anvil: https://anvilproject.org/
hpo: https://hpo.jax.org/app/browse/term/
default_prefix: anvil
imports:
- linkml:types # this imports the linkml types schema
Expand Down Expand Up @@ -33,6 +34,7 @@ classes:
- phenotypic_sex
- reported_ethnicity
- genetic_ancestry
- human_phenotype
AnvilFile:
description: Information for files associated with the study.
slots:
Expand Down Expand Up @@ -137,4 +139,13 @@ slots:
reference_assembly:
aliases: usesReferenceAssembly
description: A reference to the collection of sequences taken as the standard for a given organism. May be defined by https://www.ncbi.nlm.nih.gov/grc.
multivalued: true
multivalued: true

human_phenotype:
description: phenotype code from the human phenotype ontology (HPO)
range: HumanPhenotypeOntology

enums:
HumanPhenotypeOntology:
permissible_values:
All:
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f87a430c",
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'group'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 76\u001b[0m\n\u001b[1;32m 74\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput/AnVILBioCoreMinimal.tdr.json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# Execution\u001b[39;00m\n\u001b[0;32m---> 76\u001b[0m \u001b[43mconvert_schema\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[1], line 29\u001b[0m, in \u001b[0;36mconvert_schema\u001b[0;34m(input_file, output_file)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column, column_def \u001b[38;5;129;01min\u001b[39;00m properties\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Record foreign keys\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column_def\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m$ref\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 29\u001b[0m final_column_name \u001b[38;5;241m=\u001b[39m \u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m(^.*)_fk$\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumn\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroup\u001b[49m(\u001b[38;5;241m1\u001b[39m) \n\u001b[1;32m 30\u001b[0m target_table \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msearch(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124m$defs\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124m/(.*)$\u001b[39m\u001b[38;5;124m\"\u001b[39m, column_def\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m$ref\u001b[39m\u001b[38;5;124m\"\u001b[39m))\u001b[38;5;241m.\u001b[39mgroup(\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 31\u001b[0m foreign_keys\u001b[38;5;241m.\u001b[39mappend({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_table\u001b[39m\u001b[38;5;124m\"\u001b[39m: table, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_column\u001b[39m\u001b[38;5;124m\"\u001b[39m: final_column_name, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_table\u001b[39m\u001b[38;5;124m\"\u001b[39m: target_table})\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'group'"
]
}
],
"source": [
"# Imports\n",
"import json\n",
"import re\n",
"\n",
"# Functions\n",
"def convert_schema(input_file, output_file):\n",
" # Read in input file\n",
" with open(input_file) as infile:\n",
" input_schema = json.load(infile)\n",
"\n",
" # Parse and convert input file\n",
" output_schema = {}\n",
" table_list = []\n",
" relationship_list = []\n",
" primary_keys = {}\n",
" foreign_keys = []\n",
" if input_schema.get(\"$defs\"):\n",
" # Loop through tables and build table objects\n",
" for table, table_def in input_schema[\"$defs\"].items():\n",
" required_fields = table_def.get(\"required\")\n",
" primary_keys[table] = required_fields[0]\n",
" properties = table_def.get(\"properties\")\n",
" column_list = []\n",
" if properties:\n",
" # Loop through columns and build column objects\n",
" for column, column_def in properties.items():\n",
" # Record foreign keys\n",
" if column_def.get(\"$ref\"):\n",
" final_column_name = re.search(r\"(^.*)_fk$\", column).group(1) \n",
" target_table = re.search(r\"\\$defs\\/(.*)$\", column_def.get(\"$ref\")).group(1)\n",
" foreign_keys.append({\"from_table\": table, \"from_column\": final_column_name, \"to_table\": target_table})\n",
" else:\n",
" final_column_name = column\n",
" # Build column object\n",
" array_of = False\n",
" required = True if column in required_fields else False\n",
" initial_data_type = column_def.get(\"type\")\n",
" if initial_data_type == \"array\":\n",
" final_data_type = column_def[\"items\"].get(\"type\")\n",
" array_of = True\n",
" elif initial_data_type:\n",
" final_data_type = initial_data_type\n",
" else:\n",
" final_data_type = \"string\"\n",
" column_dict = {\"name\": final_column_name, \"datatype\": final_data_type, \"array_of\": array_of, \"required\": required}\n",
" column_list.append(column_dict) \n",
" primary_key = required_fields if required_fields else []\n",
" table_dict = {\"name\": table, \"columns\": column_list, \"primaryKey\": primary_key}\n",
" table_list.append(table_dict)\n",
"\n",
" # Loop through recorded foreign keys and build relationship objects\n",
" for fk_entry in foreign_keys:\n",
" from_table = fk_entry[\"from_table\"]\n",
" from_column = fk_entry[\"from_column\"]\n",
" to_table = fk_entry[\"to_table\"]\n",
" to_column = primary_keys.get(to_table)\n",
" rel_name = from_table + \".\" + from_column + \"_to_\" + to_table + \".\" + to_column\n",
" relationship_dict = {\"name\": rel_name, \"from\": {\"table\": from_table, \"column\": from_column}, \"to\": {\"table\": to_table, \"column\": to_column}}\n",
" relationship_list.append(relationship_dict)\n",
"\n",
" # Add table and relationship objects to output schema\n",
" if table_list:\n",
" output_schema[\"tables\"] = table_list\n",
" if relationship_list:\n",
" output_schema[\"relationships\"] = relationship_list\n",
"\n",
" # Write out output file\n",
" with open(output_file, \"w\") as outfile:\n",
" json.dump(output_schema, outfile)\n",
"\n",
"# Parameters\n",
"file_path = \"/home/cox/git/biocore-data-model/content/\"\n",
"input_file = file_path + \"json_schema/AnVILBioCoreMinimal.schema.json\"\n",
"output_file = \"output/AnVILBioCoreMinimal.tdr.json\"\n",
"# Execution\n",
"convert_schema(input_file, output_file)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1ea088c9-ef70-47ef-b1c2-d055df8da6ca",
"metadata": {},
"outputs": [],
"source": [
" with open(input_file) as infile:\n",
" input_schema = json.load(infile)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8e9ee64d-2fbc-424b-b93d-0c4b472f864a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'$defs': {'AnvilBioSample': {'additionalProperties': False,\n",
" 'description': 'Contains information about the sample(s) included in the study.',\n",
" 'properties': {'anatomical_site': {'description': 'A reference to the site within the organism from which the BioSample was taken.',\n",
" 'type': 'string'},\n",
" 'apriori_cell_type': {'description': 'A priori cell type(s) for the sample, a human assignment of cell type. This should be used when the cell type is known, but not necessarily confirmed by primary experimental data.',\n",
" 'items': {'type': 'string'},\n",
" 'type': 'array'},\n",
" 'biosample_id': {'type': 'string'},\n",
" 'biosample_type': {'description': 'The type of biosample represented by the record. This is a controlled vocabulary term from BioCoreTerms.',\n",
" 'type': 'string'},\n",
" 'disease': {'description': 'A property that identifies a disease or condition has been reported in this entity.',\n",
" 'type': 'string'},\n",
" 'donor_age_at_collection_lower_bound': {'description': 'Lower bound for age of donor at time sample was taken. If any age at collection data is present, must specify a unit as well.',\n",
" 'type': 'number'},\n",
" 'donor_age_at_collection_unit': {'description': 'The units (e.g. years or days) of the Age of the Donor at the point in time that the BioSample was obtained or other representative entity (test, diagnosis, treatment...) was created.',\n",
" 'type': 'string'},\n",
" 'donor_age_at_collection_upper_bound': {'description': 'Upper bound for age of donor at time sample was taken. If any age at collection data is present, must specify a unit as well.',\n",
" 'type': 'number'},\n",
" 'donor_id_fk': {'$ref': '#/$defs/AnvilDonor',\n",
" 'description': 'This property references the Donor organism from which the BioSample was acquired.'}},\n",
" 'required': ['biosample_id'],\n",
" 'title': 'AnvilBioSample',\n",
" 'type': 'object'},\n",
" 'AnvilDonor': {'additionalProperties': False,\n",
" 'description': 'Demographic and phenotypic information about the donor.',\n",
" 'properties': {'donor_id': {'type': 'string'},\n",
" 'genetic_ancestry': {'description': \"A property that relects a HumanDonor's reported major contributing ancestral origins based on genetic/genomic data. If you are not sure your values are calculated from the genomic data, use *reported_ethnicity* instead.\",\n",
" 'items': {'type': 'string'},\n",
" 'type': 'array'},\n",
" 'human_phenotype': {'$ref': '#/$defs/HumanPhenotypeOntology',\n",
" 'description': 'phenotype code from the human phenotype ontology (HPO)'},\n",
" 'organism_type': {'description': 'A reference to the organism type.',\n",
" 'type': 'string'},\n",
" 'phenotypic_sex': {'description': 'A reference to the BiologicalSex of the Donor organism. \\\\\"An organismal quality inhering in a bearer by virtue of the bearer\\'s physical expression of sexual characteristics. [PATO_0001894]\\\\',\n",
" 'type': 'string'},\n",
" 'reported_ethnicity': {'description': \"A property that relects a Human Donor's reported ethnic origins. Note this may contain both Race and Ethnicity information as define by the US Department of Interior (DOI) https://www.doi.gov/pmb/eeo/directives/race-data\",\n",
" 'items': {'type': 'string'},\n",
" 'type': 'array'}},\n",
" 'required': ['donor_id'],\n",
" 'title': 'AnvilDonor',\n",
" 'type': 'object'},\n",
" 'AnvilFile': {'additionalProperties': False,\n",
" 'description': 'Information for files associated with the study.',\n",
" 'properties': {'data_modality': {'description': 'Data modality describes the biological nature of the information gathered as the result of an Activity, independent of the technology or methods used to produce the information.',\n",
" 'items': {'type': 'string'},\n",
" 'type': 'array'},\n",
" 'file_format': {'description': 'An indication of the format of an electronic file; include the full file extension including compression extensions.',\n",
" 'type': 'string'},\n",
" 'file_id': {'type': 'string'},\n",
" 'file_md5sum': {'description': 'md5 checksum for the file',\n",
" 'type': 'string'},\n",
" 'file_name': {'description': 'The name of the file.', 'type': 'string'},\n",
" 'file_ref': {'description': 'The fully qualified path to the file.',\n",
" 'type': 'string'},\n",
" 'file_size': {'description': 'Property that describes the approximate size of a file in megabytes.',\n",
" 'type': 'integer'},\n",
" 'reference_assembly': {'description': 'A reference to the collection of sequences taken as the standard for a given organism. May be defined by https://www.ncbi.nlm.nih.gov/grc.',\n",
" 'items': {'type': 'string'},\n",
" 'type': 'array'}},\n",
" 'required': ['file_id'],\n",
" 'title': 'AnvilFile',\n",
" 'type': 'object'},\n",
" 'HumanPhenotypeOntology': {'description': '',\n",
" 'enum': ['ALIVE', 'DEAD', 'UNKNOWN'],\n",
" 'title': 'HumanPhenotypeOntology',\n",
" 'type': 'string'}},\n",
" '$id': 'https://github.com/DataBiosphere/biocore-data-model/tree/main/content',\n",
" '$schema': 'https://json-schema.org/draft/2019-09/schema',\n",
" 'additionalProperties': True,\n",
" 'metamodel_version': '1.7.0',\n",
" 'title': 'AnVILBioCoreMinimal',\n",
" 'type': 'object',\n",
" 'version': None}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input_schema"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41dd7586-3d45-4edb-8843-1098c49a2ba8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit e62eb14

Please sign in to comment.