Skip to content

Commit

Permalink
updated cptac to only download relevant data to cache.
Browse files Browse the repository at this point in the history
  • Loading branch information
sgosline committed Sep 14, 2023
1 parent d13fae7 commit 7f9230a
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 48 deletions.
43 changes: 42 additions & 1 deletion mRNAData/getAllDatasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,46 @@
'''

import cptac


def getCancerObj(cancertype):
    """Return a cptac dataset object for the given cancer-type key.

    Args:
        cancertype: One of 'brca', 'ccrcc', 'coad', 'gbm', 'hnscc', 'lscc',
            'luad', 'ov', 'pdac' or 'ucec'.  'ovarian' is also accepted as
            an alias for 'ov', because the driver loop in this file passes
            that spelling.

    Returns:
        A newly constructed instance of the matching cptac class
        (e.g. cptac.Brca() for 'brca').

    Raises:
        SystemExit: after printing 'Wrong cancer type: <key>' when the key
            is not recognised.
    """
    # Key -> name of the cptac class to instantiate.
    class_names = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Coad',
        'gbm': 'Gbm',
        'hnscc': 'Hnscc',
        'lscc': 'Lscc',
        'luad': 'Luad',
        'ov': 'Ov',
        'ovarian': 'Ov',  # alias: previously rejected as a wrong type
        'pdac': 'Pdac',
        'ucec': 'Ucec',
    }
    class_name = class_names.get(cancertype)
    if class_name is None:
        print('Wrong cancer type: ' + cancertype)
        # Same exit status as the exit() call this replaces.
        raise SystemExit
    # Look the class up only after validation so that an unknown key never
    # touches the cptac module.
    return getattr(cptac, class_name)()


# Driver: for each dataset name, call cptac.download_cancer, build the dataset
# object via getCancerObj, then request its clinical and transcriptomics tables.
# NOTE(review): 'ovarian' is the only entry that differs from the short key
# ('ov') that getCancerObj maps to cptac.Ov -- confirm that both
# cptac.download_cancer and getCancerObj accept this spelling.
for ds in ['brca', 'ccrcc', 'ucec', 'coad','pdac', 'ovarian', 'luad', 'hnscc', 'gbm','lscc']:
cptac.download_cancer(ds)
dat=getCancerObj(ds)

# this call changed in a recent cptac version
# Build a dict keyed by the 'Data type' column whose values come from the
# 'Available sources' column (presumably a list of source names -- confirm).
dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
clinsource = dat_list['clinical']
# Prefer the 'harmonized' clinical source when it is listed; otherwise fall
# back to the first listed source.
if 'harmonized' in clinsource:
cs = 'harmonized'
else:
cs = clinsource[0]
# Return value is discarded; presumably called only for its download/caching
# side effect -- confirm.
dat.get_clinical(cs)
tsource = dat_list['transcriptomics']
# Only the first listed transcriptomics source is requested.
dat.get_transcriptomics(tsource[0])
42 changes: 26 additions & 16 deletions mRNAData/mRNADataSetsCLI.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,34 @@ def main():
dat = cptac.Pdac()
else:
exit()
df = dat.get_transcriptomics()
#this call changed in recent version
dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
clinsource = dat_list['clinical']
if 'harmonized' in clinsource:
cs = 'harmonized'
else:
cs = clinsource[0]
dat.get_clinical(cs)
tsource = dat_list['transcriptomics']
df = dat.get_transcriptomics(tsource[0])


# Get the sample type specific dataframe
if opts.sample.lower() != 'all':
meta = dat.get_clinical()
if opts.sample.lower() == 'tumor':
ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
ind = [i for i in ind if i in df.index]
df = df.loc[ind]
elif opts.sample.lower() == 'normal':
nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
nIDs = list(set(nIDs) & set(df.index))
df = df.loc[nIDs]
df.index = [nID[:-2] if nID[-2:] ==
".N" else nID for nID in nIDs]
else:
exit("The sample type, tumor vs normal vs all (default),\
is not correctly set!")
# if opts.sample.lower() != 'all':
# meta = dat.get_clinical()
# if opts.sample.lower() == 'tumor':
# ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
# ind = [i for i in ind if i in df.index]
# df = df.loc[ind]
# elif opts.sample.lower() == 'normal':
# nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
# nIDs = list(set(nIDs) & set(df.index))
# df = df.loc[nIDs]
# df.index = [nID[:-2] if nID[-2:] ==
# ".N" else nID for nID in nIDs]
# else:
# exit("The sample type, tumor vs normal vs all (default),\
# is not correctly set!")
df.transpose().to_csv(path_or_buf="file.tsv", sep='\t')


Expand Down
43 changes: 42 additions & 1 deletion protData/getAllDatasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,46 @@
'''

import cptac


def getCancerObj(cancertype):
    """Return a cptac dataset object for the given cancer-type key.

    Args:
        cancertype: One of 'brca', 'ccrcc', 'coad', 'gbm', 'hnscc', 'lscc',
            'luad', 'ov', 'pdac' or 'ucec'.  'ovarian' is also accepted as
            an alias for 'ov', because the driver loop in this file passes
            that spelling.

    Returns:
        A newly constructed instance of the matching cptac class
        (e.g. cptac.Brca() for 'brca').

    Raises:
        SystemExit: after printing 'Wrong cancer type: <key>' when the key
            is not recognised.
    """
    # Key -> name of the cptac class to instantiate.
    class_names = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Coad',
        'gbm': 'Gbm',
        'hnscc': 'Hnscc',
        'lscc': 'Lscc',
        'luad': 'Luad',
        'ov': 'Ov',
        'ovarian': 'Ov',  # alias: previously rejected as a wrong type
        'pdac': 'Pdac',
        'ucec': 'Ucec',
    }
    class_name = class_names.get(cancertype)
    if class_name is None:
        print('Wrong cancer type: ' + cancertype)
        # Same exit status as the exit() call this replaces.
        raise SystemExit
    # Look the class up only after validation so that an unknown key never
    # touches the cptac module.
    return getattr(cptac, class_name)()


# Driver: for each dataset name, call cptac.download_cancer, build the dataset
# object via getCancerObj, then request its clinical and proteomics tables.
# NOTE(review): 'ovarian' is the only entry that differs from the short key
# ('ov') that getCancerObj maps to cptac.Ov -- confirm that both
# cptac.download_cancer and getCancerObj accept this spelling.
for ds in ['brca', 'ccrcc', 'ucec', 'coad','pdac', 'ovarian', 'luad', 'hnscc', 'gbm','lscc']:
cptac.download_cancer(ds)
dat=getCancerObj(ds)

# this call changed in a recent cptac version
# Build a dict keyed by the 'Data type' column whose values come from the
# 'Available sources' column (presumably a list of source names -- confirm).
dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
clinsource = dat_list['clinical']
# Prefer the 'harmonized' clinical source when it is listed; otherwise fall
# back to the first listed source.
if 'harmonized' in clinsource:
cs = 'harmonized'
else:
cs = clinsource[0]
# Return value is discarded; presumably called only for its download/caching
# side effect -- confirm.
dat.get_clinical(cs)
tsource = dat_list['proteomics']
# Only the first listed proteomics source is requested.
dat.get_proteomics(tsource[0])
58 changes: 28 additions & 30 deletions protData/protDataSetsCLI.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
'''
import argparse
import cptac
import numpy as np


def main():
Expand Down Expand Up @@ -35,39 +34,38 @@ def main():
elif opts.type.lower() == 'ovarian':
dat = cptac.Ov()
elif opts.type.loewr() == 'pdac':
dat = cptac.Pdac()
dat = cptac.Pdac()
else:
exit()
df = dat.get_proteomics()
#this call changed in recent version
dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
clinsource = dat_list['clinical']
if 'harmonized' in clinsource:
cs = 'harmonized'
else:
cs = clinsource[0]
dat.get_clinical(cs)
tsource = dat_list['proteomics']
df = dat.get_proteomics(tsource[0])


# Get the sample type specific dataframe
if opts.sample.lower() != 'all':
meta = dat.get_clinical()
if opts.sample.lower() == 'tumor':
ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
ind = [i for i in ind if i in df.index]
df = df.loc[ind]
elif opts.sample.lower() == 'normal':
nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
nIDs = list(set(nIDs) & set(df.index))
df = df.loc[nIDs]
df.index = [nID[:-2] if nID[-2:] ==
".N" else nID for nID in nIDs]
else:
exit("The sample type, tumor vs normal vs all (default), \
is not correctly set!")

# some dataset has two level of indices some has only one
if df.columns.nlevels == 2:
df.columns = df.columns.droplevel(1)
elif df.columns.nlevels != 1:
print("The number of column levels is larger not 1 or 2!\n")
raise
dfE = np.exp(df)
# dfU = np.log(dfE.sum(axis=1, level=0, min_count=1))
dfU = np.log(dfE.groupby(axis=1, level=0).sum())#sum(axis=1, level=0, min_count=1))
dfU.dropna(how='all', axis=0, inplace=True)
dfU.transpose().to_csv(path_or_buf="file.tsv", sep='\t')
# if opts.sample.lower() != 'all':
# meta = dat.get_clinical()
# if opts.sample.lower() == 'tumor':
# ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
# ind = [i for i in ind if i in df.index]
# df = df.loc[ind]
# elif opts.sample.lower() == 'normal':
# nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
# nIDs = list(set(nIDs) & set(df.index))
# df = df.loc[nIDs]
# df.index = [nID[:-2] if nID[-2:] ==
# ".N" else nID for nID in nIDs]
# else:
# exit("The sample type, tumor vs normal vs all (default),\
# is not correctly set!")
df.transpose().to_csv(path_or_buf="file.tsv", sep='\t')


if __name__ == '__main__':
Expand Down

0 comments on commit 7f9230a

Please sign in to comment.