-
Notifications
You must be signed in to change notification settings - Fork 0
/
gtex.py
198 lines (164 loc) · 7.92 KB
/
gtex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env python
import ssl
import urllib
import urllib.request
import click
import polars as pl
from pycomfort.files import *
from rna_clock.train import train_gtex
from rna_clock.config import Locations
from rna_clock.tune import tune_gtex
locations = Locations(Path("..") if Path(".").name == "rna_clock" else Path("."))
proxy_handler = urllib.request.ProxyHandler({})
opener = urllib.request.build_opener(proxy_handler)
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
try:
ssl._create_unverified_context
except AttributeError:
pass
else:
ssl._create_default_https_context = ssl._create_unverified_context
@click.group()
def app():
print("running rna clock application")
def un_gzip(fl: Path):
import gzip
import shutil
with gzip.open(str(fl), 'rb') as f_in:
with open(str(fl).replace(".gz", ""), 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
@app.command()
def clean():
print("cleaning files")
def download_gtex_single_cell(skit_if_exist: bool = True):
print("downloading single cell")
url = "https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad"
output_file = locations.gtex_input / "GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad"
if skit_if_exist and output_file.exists():
print("scRNA-Seq file exists, skipping downloading it")
else:
urllib.request.urlretrieve(url, output_file)
immune_url = "https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_immune_atlas_071421.public_obs.h5ad"
immune_file = locations.gtex_input / "GTEx_8_tissues_snRNAseq_immune_atlas_071421.public_obs.h5ad"
if skit_if_exist and immune_file.exists():
print("skipping immune downloads")
else:
urllib.request.urlretrieve(immune_url, immune_file)
def stabilize_rna(path: Path) -> pl.DataFrame:
print("preparing stable ids for RNA")
stable_gene = pl.col("Name").str.split(".").alias("Ensembl").apply(lambda s: s[0])
rnas = pl.read_csv(path , comment_char="#", sep="\t", skip_rows=2)
return rnas.select([
stable_gene, pl.all()
]).drop(["Name", "Description"]).groupby(pl.col("Ensembl")).agg(pl.all().sum())
def download_cattle_counts(skip_if_exist: bool = True):
#url = "https://cgtex.roslin.ed.ac.uk/wp-content/plugins/cgtex/static/rawdata/Gene_read_counts_FarmGTEx_cattle_V0.txt.txt.gz"
url = "/home/antonkulaga/Downloads/TPM.8742samples_27607genes.zip"
#counts_name = "Gene_read_counts_FarmGTEx_cattle_V0.tsv"
counts_name = "TPM.8742samples_27607genes.zip"
counts_file_gz = locations.cattle_input / (counts_name + ".gz")
counts_file = locations.cattle_input / counts_name
if skip_if_exist and counts_file.exists():
print("file exists, skipping")
else:
if not counts_file_gz.exists():
urllib.request.urlretrieve(url, counts_file_gz)
un_gzip(counts_file_gz)
counts_file_gz.unlink(missing_ok=True)
print("cattle download finished")
def download_gtex_tpms(skip_if_exist: bool = True):
url = "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"
rna_name = "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct"
stable_rna_file = locations.gtex_input / rna_name.replace("_tpm", "_tpm_stable")
rna_file_gz = locations.gtex_input / (rna_name + ".gz")
rna_file = locations.gtex_input / rna_name
if skip_if_exist and stable_rna_file.exists():
print("file exists, skipping")
else:
if not rna_file_gz.exists():
urllib.request.urlretrieve(url, rna_file_gz)
un_gzip(rna_file_gz)
rna_file_gz.unlink(missing_ok=True)
#with open(rna_file, 'r+') as fp:
# lines = fp.readlines()
# fp.seek(0)
# fp.truncate()
# fp.writelines(lines[2:])
stable_rnas = stabilize_rna(rna_file)
stable_rnas.write_csv(str(stable_rna_file), sep="\t")
rna_file.unlink(missing_ok=True)
print(f"produced stable rna file at {str(stable_rna_file)}")
transpose_gtex_bulk(stable_rna_file.name, skip_if_exist)
def transpose_gtex_bulk(rna_name: str, skip_if_exist: bool = True):
print("transposing bulk gtex data")
gct = str(locations.gtex_input / rna_name)
exp_name = rna_name.replace(".gct", "_expressions.tsv")
exp_path = locations.gtex_interim / exp_name
if skip_if_exist and exp_path.exists():
print("skipping transposing")
else:
print("transposing gtex bulk rna-seq")
import subprocess
comm = f"datamash transpose < {gct} > {str(exp_path)}"
print(f"command to run: {comm}")
transpose = subprocess.run(comm, shell=True)
print("The exit code was: %d" % transpose.returncode)
print(f"transpose of {exp_path.absolute().resolve()}")
def download_gtex_sample(skip_if_exist: bool = True):
print("downloading gtex samples info")
sample_url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
sample_file = locations.gtex_input / "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
urllib.request.urlretrieve(sample_url, sample_file)
phenotypes_url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
phenotypes_file = locations.gtex_input / "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
urllib.request.urlretrieve(phenotypes_url, phenotypes_file)
def download_gtex_bulk(skip_if_exist: bool = True):
print("downloading bulk")
download_gtex_tpms(skip_if_exist)
download_gtex_sample(skip_if_exist)
def download_gtex():
print("downloading gtex files")
download_gtex_single_cell()
download_gtex_bulk()
@app.command()
def download():
download_gtex()
#def with_subjects(exp: pl.DataFrame):
# pl.col("my").str.split(",").arr.
# pl.col("my_column").where()
def with_subjects(exp: pl.DataFrame):
samples_with_subjects_path = locations.gtex_interim / "samples_with_subjects.parquet"
samples_with_subjects = pl.read_parquet(samples_with_subjects_path)
return samples_with_subjects.join(exp, left_on="SAMPID", right_on="Ensembl")
@app.command()
def prepare_bulk():
sample_file = locations.gtex_input / "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
phenotypes_file = locations.gtex_input / "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
samples = pl.read_csv(sample_file, comment_char="#", sep="\t", ignore_errors=True)
phenotypes = pl.read_csv(phenotypes_file, comment_char="#", sep="\t", ignore_errors=True)
medium_age = pl.col("AGE").str.split("-").alias("medium_age").apply(lambda s: (int(s[1])+int(s[0]))/2.0)
phenotypes_extended = phenotypes.with_column(medium_age)
col_sample = pl.col("SAMPID")
col_get_subject = col_sample.apply(lambda s: s.split("-")[0]+"-"+s.split("-")[1]).alias("SUBJID")
samples_with_subjects = phenotypes_extended.join(samples.with_column(col_get_subject), left_on="SUBJID", right_on="SUBJID")
#saving results
samples_with_subjects_path = locations.gtex_interim / "samples_with_subjects.parquet"
samples_with_subjects.write_parquet(samples_with_subjects_path)
print(f"files saved to {samples_with_subjects_path}")
exp_path = locations.gtex_interim / "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm_stable_expressions.tsv"
print(f"loading expressions from {exp_path}")
expressions = pl.read_csv(str(exp_path), comment_char="#", sep="\t", ignore_errors=True)
expressions_extended = with_subjects(expressions)
expressions_extended.write_parquet(locations.gtex_interim / "expressions_extended.parquet")
@app.command()
def prepare_cattle_bulk():
download_cattle_counts()
@app.command()
def tune():
tune_gtex(locations)
@app.command()
def train():
train_gtex(locations)
if __name__ == '__main__':
app()