forked from theislab/sc-pert
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update.py
93 lines (78 loc) · 3.48 KB
/
update.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import re
import glob
import pandas as pd
### data updates ###
# single-cell database
# Remove any previous copies so a failed download can never be mistaken for
# fresh data by the steps further down.
for stale_file in ('data.tsv', 'personal.csv'):
    try:
        os.remove(stale_file)
    except FileNotFoundError:
        pass  # nothing to clean up (e.g. first run)

# Both downloads name their output file explicitly with `-O`.
# NOTE(review): `--no-check-certificate` disables TLS verification for the
# spreadsheet export; kept as in the original, but confirm it is still needed.
download_commands = (
    "wget -O data.tsv http://www.nxn.se/single-cell-studies/data.tsv",
    "wget --no-check-certificate -O personal.csv 'https://docs.google.com/spreadsheets/d/14awt-bCOnj4ca2uoKzuTNuKtUKXcoN82_-oGg2f1Ros/export?gid=1438063781&format=csv'",
)
for command in download_commands:
    # os.system returns the command's exit status; non-zero means wget failed.
    exit_status = os.system(command)
    if exit_status != 0:
        raise RuntimeError(f'download failed (exit status {exit_status}): {command}')
# GDSC
#os.system('wget -P /gdsc/ ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/current_release/GDSC1_fitted_dose_response_25Feb20.xlsx')
path = 'datasets/'
# Notebooks found directly under `path`. Sorted because glob makes no ordering
# promise, and these lists later determine the order of generated links.
processed_datasets = sorted(nb for nb in glob.glob(f'{path}*.ipynb') if 'curation' not in nb)
curated_datasets = sorted(glob.glob(f'{path}*_curation.ipynb'))

# Hand-maintained records; `author_year` is the "<Author>_<Year>" lookup key.
personal_rec = pd.read_csv('personal.csv')
personal_rec['author_year'] = personal_rec['Author'] + '_' + personal_rec['Year'].astype(str)
dois = personal_rec.DOI.values

# Keep only the database rows whose DOI is listed in the personal records.
# `.copy()` makes the filtered frame independent, so the column assignment
# below does not trigger pandas' chained-assignment warning.
df = pd.read_csv('data.tsv', sep='\t')
df = df[df.DOI.isin(dois)].copy()
df['Date'] = df['Date'].astype('object')  # ensure proper display
not_in_db = list(set(dois) - set(df.DOI.values))

# create placeholders for dois not in the sc studies DB
# Only the columns the two tables share are carried over. `pd.concat` replaces
# `DataFrame.append`, which no longer exists in pandas >= 2.0; row labels are
# kept as they are, exactly as `append` did by default.
pdf = personal_rec[personal_rec.DOI.isin(not_in_db)]
df = pd.concat([df, pdf[pdf.columns.intersection(df.columns)]])
# add additional columns of info
# Columns taken from the personal records; DOI is the join key.
info_fields = ['DOI', 'Treatment', '# perturbations', '# cell types', '# doses', '# timepoints', 'Author', 'Year']
add_cols = personal_rec[info_fields]
# Negative offset: the number of non-DOI columns the merge adds on the right.
n_cols = 1 - add_cols.shape[1]
df = df.merge(add_cols, left_on='DOI', right_on='DOI')

# New column order: the first five columns (up to `Title`, per the original
# note), then the freshly merged info columns, then everything in between.
all_columns = list(df.columns)
leading_columns = all_columns[:5]
merged_columns = all_columns[n_cols:]
middle_columns = all_columns[5:n_cols]
df = df[leading_columns + merged_columns + middle_columns]
# convert DOIs to links in markdown
# Each shorthand becomes a markdown link to its DOI, with "et al" rewritten
# as an italicised "et al." in the resulting text.
links = [
    f'[{shorthand}](https://doi.org/{link})'.replace('et al', '*et al.*')
    for shorthand, link in df[['Shorthand', 'DOI']].values
]
df['Shorthand'] = links
# add availability column with download links for curated datasets
# Every study gets a string of markdown links: raw/processed .h5ad files taken
# from the personal records, followed by any matching notebooks.
base_nb_path = 'https://nbviewer.ipython.org/github/theislab/sc-pert/blob/main/'
links = []
for author, year in df[['Author', 'Year']].values:
    parts = []
    row = personal_rec[personal_rec.author_year == f'{author}_{year}']
    # Guard against a study with no matching record (e.g. a missing author),
    # which would otherwise raise IndexError on `.values[0]`.
    if not row.empty:
        raw_url = row.Raw.values[0]
        processed_url = row.Processed.values[0]
        if not pd.isnull(raw_url):  # raw .h5ad
            parts.append(f' [\\[raw h5ad\\]]({raw_url})')
        if not pd.isnull(processed_url):  # processed .h5ad
            parts.append(f' [\\[processed h5ad\\]]({processed_url})')
    ## adding notebook paths
    # A literal prefix test replaces the old regex built from author/year, so
    # characters that are special in a regex cannot change what matches.
    nb_prefix = f'{path}{author}_{year}'
    for nb in curated_datasets:
        if nb.startswith(nb_prefix):
            parts.append(f' [\\[curation nb\\]]({base_nb_path}{nb})')
    for nb in processed_datasets:
        if nb.startswith(nb_prefix):
            parts.append(f' [\\[processing nb\\]]({base_nb_path}{nb})')
    links.append(''.join(parts))
df['.h5ad availability'] = links
# clean up
# Sort while `Treatment` and `Date` are both still present, then discard the
# columns that are not wanted in the final table.
df = df.sort_values(by=['Treatment', 'Date'])
unwanted_columns = ['Authors', 'Journal', 'DOI', 'bioRxiv DOI', 'Author', 'Year', 'Date']
df = df.drop(columns=unwanted_columns)

# rearrange columns
# Primary columns lead; all remaining columns follow in their existing order.
primary_cols = ['Shorthand', 'Title', '.h5ad availability', 'Treatment', '# perturbations', '# cell types', '# doses', '# timepoints']
secondary_cols = [col for col in df.columns if col not in primary_cols]
df = df[primary_cols + secondary_cols]
# write README
# Read the static body and render the table BEFORE opening README.md for
# writing, so a missing input or a rendering error cannot leave a truncated
# README behind. UTF-8 is explicit so the result does not depend on the
# platform's default encoding.
with open('readme_body.txt', encoding='utf-8') as infile:
    readme_body = infile.read()

md = df.to_markdown(index=False, tablefmt='github', floatfmt='.8g')
# Append 70 spaces after every '| Title' occurrence in the rendered table.
# NOTE(review): presumably meant to widen the Title column header - confirm.
md = md.replace('| Title', '| Title' + ' ' * 70)

# The `with` blocks close both files, so no explicit close() calls are needed.
with open('README.md', 'w', encoding='utf-8') as outfile:
    outfile.write(readme_body)
    outfile.write(md)

df.to_csv('data_table.csv')