-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update minimal anvil and tdr conversion script
- Loading branch information
Showing
4 changed files
with
502 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
236 changes: 236 additions & 0 deletions
236
content/tools/python/.ipynb_checkpoints/convert_to_tdr_schema-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "f87a430c", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "AttributeError", | ||
"evalue": "'NoneType' object has no attribute 'group'", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", | ||
"Cell \u001b[0;32mIn[1], line 76\u001b[0m\n\u001b[1;32m 74\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput/AnVILBioCoreMinimal.tdr.json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# Execution\u001b[39;00m\n\u001b[0;32m---> 76\u001b[0m \u001b[43mconvert_schema\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\n", | ||
"Cell \u001b[0;32mIn[1], line 29\u001b[0m, in \u001b[0;36mconvert_schema\u001b[0;34m(input_file, output_file)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column, column_def \u001b[38;5;129;01min\u001b[39;00m properties\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Record foreign keys\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column_def\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m$ref\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 29\u001b[0m final_column_name \u001b[38;5;241m=\u001b[39m \u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m(^.*)_fk$\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumn\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroup\u001b[49m(\u001b[38;5;241m1\u001b[39m) \n\u001b[1;32m 30\u001b[0m target_table \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msearch(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124m$defs\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124m/(.*)$\u001b[39m\u001b[38;5;124m\"\u001b[39m, column_def\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m$ref\u001b[39m\u001b[38;5;124m\"\u001b[39m))\u001b[38;5;241m.\u001b[39mgroup(\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 31\u001b[0m foreign_keys\u001b[38;5;241m.\u001b[39mappend({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_table\u001b[39m\u001b[38;5;124m\"\u001b[39m: table, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_column\u001b[39m\u001b[38;5;124m\"\u001b[39m: final_column_name, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_table\u001b[39m\u001b[38;5;124m\"\u001b[39m: target_table})\n", | ||
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'group'" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Imports\n", | ||
"import json\n", | ||
"import re\n", | ||
"\n", | ||
"# Functions\n", | ||
"def convert_schema(input_file, output_file):\n", | ||
" # Read in input file\n", | ||
" with open(input_file) as infile:\n", | ||
" input_schema = json.load(infile)\n", | ||
"\n", | ||
" # Parse and convert input file\n", | ||
" output_schema = {}\n", | ||
" table_list = []\n", | ||
" relationship_list = []\n", | ||
" primary_keys = {}\n", | ||
" foreign_keys = []\n", | ||
" if input_schema.get(\"$defs\"):\n", | ||
" # Loop through tables and build table objects\n", | ||
" for table, table_def in input_schema[\"$defs\"].items():\n", | ||
" required_fields = table_def.get(\"required\")\n", | ||
" primary_keys[table] = required_fields[0]\n", | ||
" properties = table_def.get(\"properties\")\n", | ||
" column_list = []\n", | ||
" if properties:\n", | ||
" # Loop through columns and build column objects\n", | ||
" for column, column_def in properties.items():\n", | ||
" # Record foreign keys\n", | ||
" if column_def.get(\"$ref\"):\n", | ||
" final_column_name = re.search(r\"(^.*)_fk$\", column).group(1) \n", | ||
" target_table = re.search(r\"\\$defs\\/(.*)$\", column_def.get(\"$ref\")).group(1)\n", | ||
" foreign_keys.append({\"from_table\": table, \"from_column\": final_column_name, \"to_table\": target_table})\n", | ||
" else:\n", | ||
" final_column_name = column\n", | ||
" # Build column object\n", | ||
" array_of = False\n", | ||
" required = True if column in required_fields else False\n", | ||
" initial_data_type = column_def.get(\"type\")\n", | ||
" if initial_data_type == \"array\":\n", | ||
" final_data_type = column_def[\"items\"].get(\"type\")\n", | ||
" array_of = True\n", | ||
" elif initial_data_type:\n", | ||
" final_data_type = initial_data_type\n", | ||
" else:\n", | ||
" final_data_type = \"string\"\n", | ||
" column_dict = {\"name\": final_column_name, \"datatype\": final_data_type, \"array_of\": array_of, \"required\": required}\n", | ||
" column_list.append(column_dict) \n", | ||
" primary_key = required_fields if required_fields else []\n", | ||
" table_dict = {\"name\": table, \"columns\": column_list, \"primaryKey\": primary_key}\n", | ||
" table_list.append(table_dict)\n", | ||
"\n", | ||
" # Loop through recorded foreign keys and build relationship objects\n", | ||
" for fk_entry in foreign_keys:\n", | ||
" from_table = fk_entry[\"from_table\"]\n", | ||
" from_column = fk_entry[\"from_column\"]\n", | ||
" to_table = fk_entry[\"to_table\"]\n", | ||
" to_column = primary_keys.get(to_table)\n", | ||
" rel_name = from_table + \".\" + from_column + \"_to_\" + to_table + \".\" + to_column\n", | ||
" relationship_dict = {\"name\": rel_name, \"from\": {\"table\": from_table, \"column\": from_column}, \"to\": {\"table\": to_table, \"column\": to_column}}\n", | ||
" relationship_list.append(relationship_dict)\n", | ||
"\n", | ||
" # Add table and relationship objects to output schema\n", | ||
" if table_list:\n", | ||
" output_schema[\"tables\"] = table_list\n", | ||
" if relationship_list:\n", | ||
" output_schema[\"relationships\"] = relationship_list\n", | ||
"\n", | ||
" # Write out output file\n", | ||
" with open(output_file, \"w\") as outfile:\n", | ||
" json.dump(output_schema, outfile)\n", | ||
"\n", | ||
"# Parameters\n", | ||
"file_path = \"/home/cox/git/biocore-data-model/content/\"\n", | ||
"input_file = file_path + \"json_schema/AnVILBioCoreMinimal.schema.json\"\n", | ||
"output_file = \"output/AnVILBioCoreMinimal.tdr.json\"\n", | ||
"# Execution\n", | ||
"convert_schema(input_file, output_file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "1ea088c9-ef70-47ef-b1c2-d055df8da6ca", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
" with open(input_file) as infile:\n", | ||
" input_schema = json.load(infile)\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "8e9ee64d-2fbc-424b-b93d-0c4b472f864a", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"{'$defs': {'AnvilBioSample': {'additionalProperties': False,\n", | ||
" 'description': 'Contains information about the sample(s) included in the study.',\n", | ||
" 'properties': {'anatomical_site': {'description': 'A reference to the site within the organism from which the BioSample was taken.',\n", | ||
" 'type': 'string'},\n", | ||
" 'apriori_cell_type': {'description': 'A priori cell type(s) for the sample, a human assignment of cell type. This should be used when the cell type is known, but not necessarily confirmed by primary experimental data.',\n", | ||
" 'items': {'type': 'string'},\n", | ||
" 'type': 'array'},\n", | ||
" 'biosample_id': {'type': 'string'},\n", | ||
" 'biosample_type': {'description': 'The type of biosample represented by the record. This is a controlled vocabulary term from BioCoreTerms.',\n", | ||
" 'type': 'string'},\n", | ||
" 'disease': {'description': 'A property that identifies a disease or condition has been reported in this entity.',\n", | ||
" 'type': 'string'},\n", | ||
" 'donor_age_at_collection_lower_bound': {'description': 'Lower bound for age of donor at time sample was taken. If any age at collection data is present, must specify a unit as well.',\n", | ||
" 'type': 'number'},\n", | ||
" 'donor_age_at_collection_unit': {'description': 'The units (e.g. years or days) of the Age of the Donor at the point in time that the BioSample was obtained or other representative entity (test, diagnosis, treatment...) was created.',\n", | ||
" 'type': 'string'},\n", | ||
" 'donor_age_at_collection_upper_bound': {'description': 'Upper bound for age of donor at time sample was taken. If any age at collection data is present, must specify a unit as well.',\n", | ||
" 'type': 'number'},\n", | ||
" 'donor_id_fk': {'$ref': '#/$defs/AnvilDonor',\n", | ||
" 'description': 'This property references the Donor organism from which the BioSample was acquired.'}},\n", | ||
" 'required': ['biosample_id'],\n", | ||
" 'title': 'AnvilBioSample',\n", | ||
" 'type': 'object'},\n", | ||
" 'AnvilDonor': {'additionalProperties': False,\n", | ||
" 'description': 'Demographic and phenotypic information about the donor.',\n", | ||
" 'properties': {'donor_id': {'type': 'string'},\n", | ||
" 'genetic_ancestry': {'description': \"A property that relects a HumanDonor's reported major contributing ancestral origins based on genetic/genomic data. If you are not sure your values are calculated from the genomic data, use *reported_ethnicity* instead.\",\n", | ||
" 'items': {'type': 'string'},\n", | ||
" 'type': 'array'},\n", | ||
" 'human_phenotype': {'$ref': '#/$defs/HumanPhenotypeOntology',\n", | ||
" 'description': 'phenotype code from the human phenotype ontology (HPO)'},\n", | ||
" 'organism_type': {'description': 'A reference to the organism type.',\n", | ||
" 'type': 'string'},\n", | ||
" 'phenotypic_sex': {'description': 'A reference to the BiologicalSex of the Donor organism. \\\\\"An organismal quality inhering in a bearer by virtue of the bearer\\'s physical expression of sexual characteristics. [PATO_0001894]\\\\',\n", | ||
" 'type': 'string'},\n", | ||
" 'reported_ethnicity': {'description': \"A property that relects a Human Donor's reported ethnic origins. Note this may contain both Race and Ethnicity information as define by the US Department of Interior (DOI) https://www.doi.gov/pmb/eeo/directives/race-data\",\n", | ||
" 'items': {'type': 'string'},\n", | ||
" 'type': 'array'}},\n", | ||
" 'required': ['donor_id'],\n", | ||
" 'title': 'AnvilDonor',\n", | ||
" 'type': 'object'},\n", | ||
" 'AnvilFile': {'additionalProperties': False,\n", | ||
" 'description': 'Information for files associated with the study.',\n", | ||
" 'properties': {'data_modality': {'description': 'Data modality describes the biological nature of the information gathered as the result of an Activity, independent of the technology or methods used to produce the information.',\n", | ||
" 'items': {'type': 'string'},\n", | ||
" 'type': 'array'},\n", | ||
" 'file_format': {'description': 'An indication of the format of an electronic file; include the full file extension including compression extensions.',\n", | ||
" 'type': 'string'},\n", | ||
" 'file_id': {'type': 'string'},\n", | ||
" 'file_md5sum': {'description': 'md5 checksum for the file',\n", | ||
" 'type': 'string'},\n", | ||
" 'file_name': {'description': 'The name of the file.', 'type': 'string'},\n", | ||
" 'file_ref': {'description': 'The fully qualified path to the file.',\n", | ||
" 'type': 'string'},\n", | ||
" 'file_size': {'description': 'Property that describes the approximate size of a file in megabytes.',\n", | ||
" 'type': 'integer'},\n", | ||
" 'reference_assembly': {'description': 'A reference to the collection of sequences taken as the standard for a given organism. May be defined by https://www.ncbi.nlm.nih.gov/grc.',\n", | ||
" 'items': {'type': 'string'},\n", | ||
" 'type': 'array'}},\n", | ||
" 'required': ['file_id'],\n", | ||
" 'title': 'AnvilFile',\n", | ||
" 'type': 'object'},\n", | ||
" 'HumanPhenotypeOntology': {'description': '',\n", | ||
" 'enum': ['ALIVE', 'DEAD', 'UNKNOWN'],\n", | ||
" 'title': 'HumanPhenotypeOntology',\n", | ||
" 'type': 'string'}},\n", | ||
" '$id': 'https://github.com/DataBiosphere/biocore-data-model/tree/main/content',\n", | ||
" '$schema': 'https://json-schema.org/draft/2019-09/schema',\n", | ||
" 'additionalProperties': True,\n", | ||
" 'metamodel_version': '1.7.0',\n", | ||
" 'title': 'AnVILBioCoreMinimal',\n", | ||
" 'type': 'object',\n", | ||
" 'version': None}" | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"input_schema" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "41dd7586-3d45-4edb-8843-1098c49a2ba8", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.8" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.