Skip to content

Commit

Permalink
Added log report
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilAppleby committed Sep 14, 2023
1 parent a69b17b commit b424e92
Show file tree
Hide file tree
Showing 7 changed files with 147 additions and 13 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,5 @@ coconnect/cdm/classes/
scripts/*.json
temp
airflow*
*.json
.python-version
workflows
workflows
2 changes: 1 addition & 1 deletion carrot/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.6.85'
__version__ = '0.6.9'
33 changes: 25 additions & 8 deletions carrot/cli/subcommands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,7 @@ def map(ctx,rules,inputs,format_level,
help="File containing person_ids in the first column")
@click.option("--omop-config",
required=True,
help="File containing json configfor omop outputs")
help="File containing json config for omop outputs")
@click.option("--saved-person-id-filename",
default='person_ids.tsv',
required=False,
Expand Down Expand Up @@ -664,22 +664,34 @@ def mapstream(rules, output_folder, write_mode, person_file, omop_config, saved_
tgtcolmaps[tgtfile] = omopcdm.get_omop_column_map(tgtfile)

except IOError as e:
print("I/O error({0}): {1}".format(e.errno, e.strerror))
print("I/O - error({0}): {1} -> {2}".format(e.errno, e.strerror, str(e)))
exit()

print("person_id stats: total loaded {0}, reject count {1}".format(len(person_lookup), rejected_person_count))

input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
# TODO get this list of input files from the parsed rules
existing_input_files = fnmatch.filter(os.listdir(input_dir[0]), '*.csv')
rules_input_files = mappingrules.get_all_infile_names()
for infile in existing_input_files:
if infile not in rules_input_files:
msg = "ERROR: no mapping rules found for existing input file - {0}".format(infile)
print(msg)
metrics.add_log_data(msg)
for infile in rules_input_files:
if infile not in existing_input_files:
msg = "ERROR: no data for mapped input file - {0}".format(infile)
print(msg)
metrics.add_log_data(msg)
rejidcounts = {}
rejdatecounts = {}
src_tgt_counts = {}
print(rules_input_files)

for srcfilename in input_files:
for srcfilename in rules_input_files:
rejidcounts[srcfilename] = 0
rejdatecounts[srcfilename] = 0


for srcfilename in input_files:
for srcfilename in rules_input_files:
outcounts = {}
rejcounts = {}
rcount = 0
Expand All @@ -688,8 +700,9 @@ def mapstream(rules, output_folder, write_mode, person_file, omop_config, saved_
fh = open(input_dir[0] + "/" + srcfilename, mode='r')
csvr = csv.reader(fh)
except IOError as e:
print("Unable to open: {0}".format(input_dir[0] + "/" + srcfilename))
print("I/O error({0}): {1}".format(e.errno, e.strerror))
exit()
continue

tgtfiles, src_to_tgt = mappingrules.parse_rules_src_to_tgt(srcfilename)
infile_datetime_source, infile_person_id_source = mappingrules.get_infile_date_person_id(srcfilename)
Expand Down Expand Up @@ -767,13 +780,17 @@ def mapstream(rules, output_folder, write_mode, person_file, omop_config, saved_

print("--------------------------------------------------------------------------------")
data_summary = metrics.get_mapstream_summary()
log_report = metrics.get_log_data()
try:
dsfh = open(output_folder + "/summary_mapstream.tsv", mode="w")
dsfh.write(data_summary)
dsfh.close()
logfh = open(output_folder + "/error_report.txt", mode="w")
logfh.write(log_report)
logfh.close()
except IOError as e:
print("I/O error({0}): {1}".format(e.errno, e.strerror))
print("Unable to write summary.tsv")
print("Unable to write file")

nowtime = time.time()
print("Elapsed time = {0:.5f} secs".format(nowtime - starttime))
Expand Down
96 changes: 96 additions & 0 deletions carrot/data/config/omop.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
{
"all_columns": {
"condition_occurrence": ["condition_occurrence_id", "person_id",
"condition_concept_id", "condition_start_date",
"condition_start_datetime", "condition_end_date",
"condition_end_datetime", "condition_type_concept_id",
"stop_reason", "provider_id", "visit_occurrence_id",
"condition_source_value", "condition_source_concept_id",
"condition_status_source_value", "condition_status_concept_id"],
"death": ["person_id", "death_date", "death_datetime", "death_type_concept_id",
"cause_concept_id", "cause_source_value", "cause_source_concept_id"],
"drug_exposure": ["drug_exposure_id", "person_id", "drug_concept_id",
"drug_exposure_start_date", "drug_exposure_start_datetime",
"drug_exposure_end_date", "drug_exposure_end_datetime", "verbatim_end_date",
"drug_type_concept_id", "stop_reason",
"refills", "quantity", "days_supply", "sig", "route_concept_id",
"lot_number", "provider_id", "visit_occurrence_id",
"drug_source_value", "drug_source_concept_id", "route_source_value",
"dose_unit_source_value"],
"measurement": ["measurement_id", "person_id", "measurement_concept_id",
"measurement_date", "measurement_datetime", "measurement_type_concept_id",
"operator_concept_id", "value_as_number", "value_as_concept_id",
"unit_concept_id", "range_low", "range_high", "provider_id",
"visit_occurrence_id", "measurement_source_value", "measurement_source_concept_id",
"unit_source_value", "value_source_value"],
"observation": ["observation_id", "person_id", "observation_concept_id", "observation_date",
"observation_datetime", "observation_type_concept_id", "value_as_number",
"value_as_string", "value_as_concept_id", "qualifier_concept_id",
"unit_concept_id provider_id", "visit_occurrence_id", "observation_source_value",
"observation_source_concept_id", "unit_source_value", "qualifier_source_value"],
"person": ["person_id", "gender_concept_id", "year_of_birth", "month_of_birth",
"day_of_birth", "birth_datetime", "race_concept_id", "ethnicity_concept_id",
"location_id", "provider_id", "care_site_id", "person_source_value",
"gender_source_value", "gender_source_concept_id",
"race_source_value", "race_source_concept_id",
"ethnicity_source_value", "ethnicity_source_concept_id"],
"procedure_occurrence": ["procedure_occurrence_id", "person_id", "procedure_concept_id",
"procedure_date", "procedure_datetime",
"procedure_type_concept_id", "modifier_concept_id",
"quantity", "provider_id", "visit_occurrence_id",
"procedure_source_value", "procedure_source_concept_id", "qualifier_source_value"],
"specimen": ["specimen_id", "person_id", "specimen_concept_id", "specimen_type_concept_id",
"specimen_date", "specimen_datetime", "quantity", "unit_concept_id",
"anatomic_site_concept_id", "disease_status_concept_id", "specimen_source_id", "specimen_source_value",
"unit_source_value", "anatomic_site_source_value", "disease_status_source_value"],
"visit_occurrence": ["visit_occurrence_id", "person_id", "visit_concept_id", "visit_start_date", "visit_start_datetime",
"visit_end_date", "visit_end_datetime", "visit_type_concept_id", "provider_id", "care_site_id",
"visit_source_value", "visit_source_concept_id", "admitting_source_concept_id", "admitting_source_value",
"discharge_to_concept_id", "discharge_to_source_value", "preceding_visit_occurrence_id"]
},
"date_fields": {
"condition_occurrence": {"condition_start_datetime": "condition_start_date", "condition_end_datetime": "condition_end_date"},
"death": {"death_datetime": "death_date"},
"drug_exposure": {"drug_exposure_start_datetime": "drug_exposure_start_date", "drug_exposure_end_datetime": "drug_exposure_end_date"},
"measurement": {"measurement_datetime": "measurement_date"},
"observation": {"observation_datetime": "observation_date"},
"procedure_occurrence": {"procedure_datetime": "procedure_date"},
"specimen": {"specimen_datetime": "specimen_date"},
"visit_occurrence": {"visit_start_datetime": "visit_start_date", "visit_end_datetime": "visit_end_date"}
},
"date_field_components": {
"person": {"birth_datetime": {"year":"year_of_birth", "month":"month_of_birth", "day":"day_of_birth"}}
},
"datetime_fields": {
"condition_occurrence": ["condition_start_datetime", "condition_end_datetime"],
"death": ["death_datetime"],
"drug_exposure": ["drug_exposure_start_datetime", "drug_exposure_end_datetime"],
"measurement": ["measurement_datetime"],
"observation": ["observation_datetime"],
"person": ["birth_datetime"],
"procedure_occurrence": ["procedure_datetime"],
"specimen": ["specimen_datetime"],
"visit_occurrence": ["visit_start_datetime", "visit_end_datetime"]
},
"person_id_field": {
"condition_occurrence": "person_id",
"death": "person_id",
"drug_exposure": "person_id",
"measurement": "person_id",
"observation": "person_id",
"person": "person_id",
"procedure_occurrence": "person_id",
"specimen": "person_id",
"visit_occurrence": "person_id"
},
"auto_number_field": {
"condition_occurrence": "condition_occurrence_id",
"death": "death_id",
"drug_exposure": "drug_exposure_id",
"measurement": "measurement_id",
"observation": "observation_id",
"procedure_occurrence": "procedure_occurrence_id",
"specimen": "specimen_id",
"visit_occurrence": "visit_occurrence_id"
}
}
13 changes: 13 additions & 0 deletions carrot/tools/mappingrules.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,24 @@ def load_json(self, f_in):

def get_all_outfile_names(self):
file_list = []

for outfilename in self.rules_data["cdm"]:
file_list.append(outfilename)

return file_list

def get_all_infile_names(self):
file_list = []

for outfilename, conditions in self.rules_data["cdm"].items():
for outfield, source_field in conditions.items():
for source_field_name, source_data in source_field.items():
if "source_table" in source_data:
if source_data["source_table"] not in file_list:
file_list.append(source_data["source_table"])

return file_list

def get_infile_data_fields(self, infilename):
data_fields_lists = {}

Expand Down
8 changes: 8 additions & 0 deletions carrot/tools/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ class Metrics():
def __init__(self):
self.datasummary={}
self.allcounts={}
self.log_data=""

def get_new_mapstream_counts(self):
counts = {}
Expand Down Expand Up @@ -104,3 +105,10 @@ def get_mapstream_summary(self):
summary_str += source + "\t" + fieldname + "\t" + tablename + "\t" + input_count + "\t" + invalid_person_ids + "\t" + invalid_date_fields + "\t" + invalid_source_fields + "\t" + output_count + "\n"

return summary_str

def add_log_data(self, msg):
self.log_data += msg + "\n"

def get_log_data(self):
return self.log_data

5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@


setuptools.setup(
#name="carrot-cdm-beta",
name="carrot-cdm",
author="CO-CONNECT Collaboration",
version=version,
author_email="calmacx@gmail.com",
author_email="pdappleby@gmail.com",
description="Python package for performing mapping of ETL to CDM ",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down Expand Up @@ -53,7 +54,7 @@
"python-dotenv",
"co-connect-pseudonymise"
],
package_data={'carrot': ['data/cdm/*','data/example/*/*','data/test/*/*','data/test/*/*/*']},
package_data={'carrot': ['data/config/*', 'data/cdm/*','data/example/*/*','data/test/*/*','data/test/*/*/*']},
include_package_data=True,
classifiers=[
"Programming Language :: Python :: 3",
Expand Down

0 comments on commit b424e92

Please sign in to comment.