From 7f9230acdbc33d32bce6afa656d8809bfc4e370e Mon Sep 17 00:00:00 2001
From: Sara JC Gosline
Date: Thu, 14 Sep 2023 06:33:04 -0700
Subject: [PATCH] updated cptac to only download relevant data to cache.

---
Review notes (commentary only; not part of the commit message):

* getCancerObj() in both getAllDatasets.py scripts now accepts 'ovarian'
  as well as 'ov'.  The caching loop iterates over 'ovarian', which
  previously fell into the "Wrong cancer type" branch and called exit()
  before luad, hnscc, gbm and lscc were cached.  The CLI scripts already
  map 'ovarian' to cptac.Ov(), so the alias keeps the two in agreement.
* protDataSetsCLI.py: `opts.type.loewr()` is corrected to
  `opts.type.lower()`; the misspelled method raised AttributeError
  whenever the elif chain reached the 'pdac' branch.
* NOTE(review): line breaks and indentation were rebuilt from a
  whitespace-collapsed copy of this patch.  The position of blank context
  lines, the exact whitespace-only change on the `dat = cptac.Pdac()`
  line, and the author address could not be recovered, and the blob ids
  on the `index` lines predate the two fixes above.  Run
  `git apply --check` against the target tree before relying on it.

 mRNAData/getAllDatasets.py  | 43 ++++++++++++++++++++++++++-
 mRNAData/mRNADataSetsCLI.py | 42 +++++++++++++++++----------
 protData/getAllDatasets.py  | 43 ++++++++++++++++++++++++++-
 protData/protDataSetsCLI.py | 60 ++++++++++++++++++-------------------
 4 files changed, 139 insertions(+), 49 deletions(-)

diff --git a/mRNAData/getAllDatasets.py b/mRNAData/getAllDatasets.py
index e32c62f..bc5a596 100644
--- a/mRNAData/getAllDatasets.py
+++ b/mRNAData/getAllDatasets.py
@@ -4,5 +4,46 @@
 '''
 import cptac
 
+
+
+def getCancerObj(cancertype):
+    # cptac.download(dataset=cancertype,source='harmonized',)
+    if cancertype == 'brca':
+        dat = cptac.Brca()
+    elif cancertype == 'ccrcc':
+        dat = cptac.Ccrcc()
+    elif cancertype == 'coad':
+        dat = cptac.Coad()
+    elif cancertype == 'gbm':
+        dat = cptac.Gbm()
+    elif cancertype == 'hnscc':
+        dat = cptac.Hnscc()
+    elif cancertype == 'lscc':
+        dat = cptac.Lscc()
+    elif cancertype == 'luad':
+        dat = cptac.Luad()
+    elif cancertype in ('ov', 'ovarian'):
+        dat = cptac.Ov()
+    elif cancertype =='pdac':
+        dat = cptac.Pdac()
+    elif cancertype =='ucec':
+        dat = cptac.Ucec()
+    else:
+        print('Wrong cancer type: '+cancertype)
+        exit()
+    return dat
+
+
 for ds in ['brca', 'ccrcc', 'ucec', 'coad','pdac', 'ovarian', 'luad', 'hnscc', 'gbm','lscc']:
-    cptac.download_cancer(ds)
+    dat=getCancerObj(ds)
+
+    #this call changed in recent version
+    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
+    clinsource = dat_list['clinical']
+    if 'harmonized' in clinsource:
+        cs = 'harmonized'
+    else:
+        cs = clinsource[0]
+    dat.get_clinical(cs)
+    tsource = dat_list['transcriptomics']
+    dat.get_transcriptomics(tsource[0])
diff --git a/mRNAData/mRNADataSetsCLI.py b/mRNAData/mRNADataSetsCLI.py
index 9e8a1ea..fd471b3 100644
--- a/mRNAData/mRNADataSetsCLI.py
+++ b/mRNAData/mRNADataSetsCLI.py
@@ -37,24 +37,34 @@ def main():
         dat = cptac.Pdac()
     else:
         exit()
-    df = dat.get_transcriptomics()
+    #this call changed in recent version
+    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
+    clinsource = dat_list['clinical']
+    if 'harmonized' in clinsource:
+        cs = 'harmonized'
+    else:
+        cs = clinsource[0]
+    dat.get_clinical(cs)
+    tsource = dat_list['transcriptomics']
+    df = dat.get_transcriptomics(tsource[0])
+
 
     # Get the sample type specific dataframe
-    if opts.sample.lower() != 'all':
-        meta = dat.get_clinical()
-        if opts.sample.lower() == 'tumor':
-            ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
-            ind = [i for i in ind if i in df.index]
-            df = df.loc[ind]
-        elif opts.sample.lower() == 'normal':
-            nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
-            nIDs = list(set(nIDs) & set(df.index))
-            df = df.loc[nIDs]
-            df.index = [nID[:-2] if nID[-2:] ==
-                        ".N" else nID for nID in nIDs]
-        else:
-            exit("The sample type, tumor vs normal vs all (default),\
-                 is not correctly set!")
+    # if opts.sample.lower() != 'all':
+    #     meta = dat.get_clinical()
+    #     if opts.sample.lower() == 'tumor':
+    #         ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
+    #         ind = [i for i in ind if i in df.index]
+    #         df = df.loc[ind]
+    #     elif opts.sample.lower() == 'normal':
+    #         nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
+    #         nIDs = list(set(nIDs) & set(df.index))
+    #         df = df.loc[nIDs]
+    #         df.index = [nID[:-2] if nID[-2:] ==
+    #                     ".N" else nID for nID in nIDs]
+    #     else:
+    #         exit("The sample type, tumor vs normal vs all (default),\
+    #              is not correctly set!")
     df.transpose().to_csv(path_or_buf="file.tsv", sep='\t')
 
 
diff --git a/protData/getAllDatasets.py b/protData/getAllDatasets.py
index e32c62f..b34daf8 100644
--- a/protData/getAllDatasets.py
+++ b/protData/getAllDatasets.py
@@ -4,5 +4,46 @@
 '''
 import cptac
 
+
+
+def getCancerObj(cancertype):
+    # cptac.download(dataset=cancertype,source='harmonized',)
+    if cancertype == 'brca':
+        dat = cptac.Brca()
+    elif cancertype == 'ccrcc':
+        dat = cptac.Ccrcc()
+    elif cancertype == 'coad':
+        dat = cptac.Coad()
+    elif cancertype == 'gbm':
+        dat = cptac.Gbm()
+    elif cancertype == 'hnscc':
+        dat = cptac.Hnscc()
+    elif cancertype == 'lscc':
+        dat = cptac.Lscc()
+    elif cancertype == 'luad':
+        dat = cptac.Luad()
+    elif cancertype in ('ov', 'ovarian'):
+        dat = cptac.Ov()
+    elif cancertype =='pdac':
+        dat = cptac.Pdac()
+    elif cancertype =='ucec':
+        dat = cptac.Ucec()
+    else:
+        print('Wrong cancer type: '+cancertype)
+        exit()
+    return dat
+
+
 for ds in ['brca', 'ccrcc', 'ucec', 'coad','pdac', 'ovarian', 'luad', 'hnscc', 'gbm','lscc']:
-    cptac.download_cancer(ds)
+    dat=getCancerObj(ds)
+
+    #this call changed in recent version
+    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
+    clinsource = dat_list['clinical']
+    if 'harmonized' in clinsource:
+        cs = 'harmonized'
+    else:
+        cs = clinsource[0]
+    dat.get_clinical(cs)
+    tsource = dat_list['proteomics']
+    dat.get_proteomics(tsource[0])
diff --git a/protData/protDataSetsCLI.py b/protData/protDataSetsCLI.py
index ecdc21d..efa5429 100644
--- a/protData/protDataSetsCLI.py
+++ b/protData/protDataSetsCLI.py
@@ -4,7 +4,6 @@
 '''
 import argparse
 import cptac
-import numpy as np
 
 
 def main():
@@ -35,39 +34,38 @@ def main():
     elif opts.type.lower() == 'ovarian':
         dat = cptac.Ov()
-    elif opts.type.loewr() == 'pdac':
-        dat = cptac.Pdac()
+    elif opts.type.lower() == 'pdac':
+        dat = cptac.Pdac()
     else:
         exit()
-    df = dat.get_proteomics()
+    #this call changed in recent version
+    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
+    clinsource = dat_list['clinical']
+    if 'harmonized' in clinsource:
+        cs = 'harmonized'
+    else:
+        cs = clinsource[0]
+    dat.get_clinical(cs)
+    tsource = dat_list['proteomics']
+    df = dat.get_proteomics(tsource[0])
+
 
     # Get the sample type specific dataframe
-    if opts.sample.lower() != 'all':
-        meta = dat.get_clinical()
-        if opts.sample.lower() == 'tumor':
-            ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
-            ind = [i for i in ind if i in df.index]
-            df = df.loc[ind]
-        elif opts.sample.lower() == 'normal':
-            nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
-            nIDs = list(set(nIDs) & set(df.index))
-            df = df.loc[nIDs]
-            df.index = [nID[:-2] if nID[-2:] ==
-                        ".N" else nID for nID in nIDs]
-        else:
-            exit("The sample type, tumor vs normal vs all (default), \
-                 is not correctly set!")
-
-    # some dataset has two level of indices some has only one
-    if df.columns.nlevels == 2:
-        df.columns = df.columns.droplevel(1)
-    elif df.columns.nlevels != 1:
-        print("The number of column levels is larger not 1 or 2!\n")
-        raise
-    dfE = np.exp(df)
-#    dfU = np.log(dfE.sum(axis=1, level=0, min_count=1))
-    dfU = np.log(dfE.groupby(axis=1, level=0).sum())#sum(axis=1, level=0, min_count=1))
-    dfU.dropna(how='all', axis=0, inplace=True)
-    dfU.transpose().to_csv(path_or_buf="file.tsv", sep='\t')
+    # if opts.sample.lower() != 'all':
+    #     meta = dat.get_clinical()
+    #     if opts.sample.lower() == 'tumor':
+    #         ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
+    #         ind = [i for i in ind if i in df.index]
+    #         df = df.loc[ind]
+    #     elif opts.sample.lower() == 'normal':
+    #         nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
+    #         nIDs = list(set(nIDs) & set(df.index))
+    #         df = df.loc[nIDs]
+    #         df.index = [nID[:-2] if nID[-2:] ==
+    #                     ".N" else nID for nID in nIDs]
+    #     else:
+    #         exit("The sample type, tumor vs normal vs all (default),\
+    #              is not correctly set!")
+    df.transpose().to_csv(path_or_buf="file.tsv", sep='\t')
 
 
 if __name__ == '__main__':