
Commit

Fixing merge conflicts
yolaj-nhs committed Feb 6, 2024
1 parent 501a730 commit 87d4749
Showing 1 changed file with 0 additions and 186 deletions.
186 changes: 0 additions & 186 deletions aisdc/preprocessing/loaders.py
@@ -657,189 +657,3 @@ def hospital_days(row):
df[col] = encoder.fit_transform(df[col].values)

return (df, labels)

=======
def _texas_hospitals(
data_folder: str,
) -> Tuple[pd.DataFrame, pd.DataFrame]: # pragma: no cover
# pylint: disable=too-many-statements, too-many-locals
"""
Texas Hospitals Dataset
https://www.dshs.texas.gov/texas-health-care-information-collection/health-data-researcher-information/texas-inpatient-public-use # pylint: disable=line-too-long.
Download the tab-delimited files for each quarter from
2006, 2007, 2008 and 2009.
Note: This data is free to download.
"""
file_list = [
"PUDF 1Q2006 tab-delimited.zip",
"PUDF 1Q2007 tab-delimited.zip",
"PUDF 1Q2009 tab-delimited.zip",
"PUDF 2Q2006 tab-delimited.zip",
"PUDF 2Q2007 tab-delimited.zip",
"PUDF 2Q2009 tab-delimited.zip",
"PUDF 3Q2006 tab-delimited.zip",
"PUDF 3Q2007 tab-delimited.zip",
"PUDF 4Q2006 tab-delimited.zip",
"PUDF 4Q2007 tab-delimited.zip",
"PUDF 4Q2009 tab-delimited.zip",
"PUDF1Q08_update_tab.zip",
"PUDF2Q08_update_tab.zip",
"PUDF3Q08_update_tab.zip",
]
files_path = [os.path.join(data_folder, f) for f in file_list]

found = [os.path.exists(file_path) for file_path in files_path]
not_found = [file_path for file_path in files_path if not os.path.exists(file_path)]

processed_data_file = "texas_data10_rm_binary.csv"
if not all(found):
help_message = f"""
Some or all data files do not exist. Please accept the terms & conditions, then download the
tab-delimited files for each quarter from 2006 to 2009 from:
https://www.dshs.texas.gov/THCIC/Hospitals/Download.shtm
and place them in the correct folder.
Missing files are:
{not_found}
"""
raise DataNotAvailable(help_message)

if not os.path.exists(
os.path.join(data_folder, "TexasHospitals", processed_data_file)
):
logger.info("Processing Texas Hospitals data (2006-2009)")

# Load data
columns_names = [
"THCIC_ID", # Provider ID. Unique identifier assigned to the provider by DSHS.
# Hospitals with fewer than 50 discharges have been aggregated into the
# Provider ID '999999'
"DISCHARGE_QTR", # yyyyQm
"TYPE_OF_ADMISSION",
"SOURCE_OF_ADMISSION",
"PAT_ZIP", # Patient’s five-digit ZIP code
"PUBLIC_HEALTH_REGION", # Public Health Region of patient’s address
"PAT_STATUS", # Code indicating patient status as of the ending date of service for
# the period of care reported
"SEX_CODE",
"RACE",
"ETHNICITY",
"LENGTH_OF_STAY",
"PAT_AGE", # Code indicating age of patient in days or years on date of discharge.
"PRINC_DIAG_CODE", # diagnosis code for the principal diagnosis
"E_CODE_1", # external cause of injury
"PRINC_SURG_PROC_CODE", # Code for the principal surgical or other procedure performed
# during the period covered by the bill
"RISK_MORTALITY", # Assignment of a risk of mortality score from the All Patient
# Refined (APR) Diagnosis Related Group (DRG)
"ILLNESS_SEVERITY", # Assignment of a severity of illness score from the All Patient
# Refined (APR) Diagnosis Related Group (DRG)
"RECORD_ID",
]
# obtain the 10 most frequent principal surgical procedures
tmp = []
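# only the "base" discharge table inside each quarterly zip is read here;
# any other member files in the archive are skipped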
for f in files_path:
df = [
pd.read_csv(
ZipFile(f).open(i), sep="\t", usecols=["PRINC_SURG_PROC_CODE"]
)
for i in ZipFile(f).namelist()
if "base" in i
]
if len(df) < 1:
print(f"WARNING: {f} could not be loaded.")
else:
df[0].dropna(inplace=True)
tmp.extend(list(df[0].PRINC_SURG_PROC_CODE))
princ_surg_proc_keep = [k for k, v in Counter(tmp).most_common(10)]
# remove unnecessary variables
del tmp

# Load the data
tx_data = pd.DataFrame()
for f in files_path:
df = [
pd.read_csv(ZipFile(f).open(i), sep="\t", usecols=columns_names)
for i in ZipFile(f).namelist()
if "base" in i
][0]
# keep only those rows with one of the 10 most common principal surgical procedures
df = df[df["PRINC_SURG_PROC_CODE"].isin(princ_surg_proc_keep)]
# clean up data
df.dropna(inplace=True)
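# the raw files use "`" and "*" as placeholder values; treat them as missing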
df.replace("`", pd.NA, inplace=True)
df.replace("*", pd.NA, inplace=True)
# replace sex to numeric
df.SEX_CODE.replace("M", 0, inplace=True)
df.SEX_CODE.replace("F", 1, inplace=True)
df.SEX_CODE.replace("U", 2, inplace=True)
# set to numerical variable
for d_code in set(list(df.DISCHARGE_QTR)):
df.DISCHARGE_QTR.replace(
d_code, "".join(d_code.split("Q")), inplace=True
)
df.dropna(inplace=True)
# merge data
tx_data = pd.concat([tx_data, df])
# remove unnecessary variables
del df

# Risk mortality, make it binary
# 1 Minor
# 2 Moderate
# 3 Major
# 4 Extreme
tx_data.RISK_MORTALITY = tx_data.RISK_MORTALITY.astype(int)
tx_data.RISK_MORTALITY.replace(1, 0, inplace=True)
tx_data.RISK_MORTALITY.replace(2, 0, inplace=True)
tx_data.RISK_MORTALITY.replace(3, 1, inplace=True)
tx_data.RISK_MORTALITY.replace(4, 1, inplace=True)

# renumber non-numerical codes for cols
cols = ["PRINC_DIAG_CODE", "SOURCE_OF_ADMISSION", "E_CODE_1"]
for col in cols:
tmp = list(
{
x
for x in tx_data[col]
if not str(x).isdigit() and not isinstance(x, float)
} # pylint: disable=consider-using-set-comprehension
)
n = max(
list(
{
int(x)
for x in tx_data[col]
if str(x).isdigit() or isinstance(x, float)
} # pylint: disable=consider-using-set-comprehension
)
)
for i, x in enumerate(tmp):
tx_data[col].replace(x, n + i, inplace=True)
del tmp, n
# set index
tx_data.set_index("RECORD_ID", inplace=True)
# final check and drop of NAs
tx_data.dropna(inplace=True)
# convert all data to numerical
tx_data = tx_data.astype(int)
# save csv file
tx_data.to_csv(os.path.join(data_folder, "TexasHospitals", processed_data_file))
else:
logger.info("Loading processed Texas Hospitals data (2006-2009) csv file.")
# load texas data processed csv file
tx_data = pd.read_csv(
os.path.join(data_folder, "TexasHospitals", processed_data_file)
)

# extract target
var = "RISK_MORTALITY"
labels = tx_data[var]
# Drop the column that contains the labels
tx_data.drop([var], axis=1, inplace=True)

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels.values)
labels = pd.DataFrame({var: encoded_labels})

return (tx_data, labels)
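For orientation, a minimal sketch of how the removed loader might be exercised, assuming the quarterly zip files listed above have been downloaded into a local folder; the folder name and the direct call to the private helper are illustrative only, not part of this commit:

# Illustrative sketch only: assumes the Texas Hospitals PUDF zips are in
# data/texas and that the loader is importable from the installed package.
from aisdc.preprocessing.loaders import _texas_hospitals

features, labels = _texas_hospitals("data/texas")
print(features.shape)                            # one row per discharge record
print(labels["RISK_MORTALITY"].value_counts())   # binary mortality-risk target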
