Skip to content

Commit 6e5a402

Browse files
Merge pull request #4 from bbglab/dev/MANE_implementation
Adapt BoostDM pipeline to accept MANE transcript
2 parents f603df1 + 167fdc0 commit 6e5a402

File tree

10 files changed

+293
-33
lines changed

10 files changed

+293
-33
lines changed

.gitignore

+201
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
# Created by https://www.toptal.com/developers/gitignore/api/python
2+
# Edit at https://www.toptal.com/developers/gitignore?templates=python
3+
4+
### Python ###
5+
# Byte-compiled / optimized / DLL files
6+
__pycache__/
7+
*.py[cod]
8+
*$py.class
9+
10+
# C extensions
11+
*.so
12+
13+
# Distribution / packaging
14+
.Python
15+
build/
16+
develop-eggs/
17+
dist/
18+
downloads/
19+
eggs/
20+
.eggs/
21+
lib/
22+
lib64/
23+
parts/
24+
sdist/
25+
var/
26+
wheels/
27+
share/python-wheels/
28+
*.egg-info/
29+
.installed.cfg
30+
*.egg
31+
MANIFEST
32+
33+
# PyInstaller
34+
# Usually these files are written by a python script from a template
35+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
36+
*.manifest
37+
*.spec
38+
39+
# Installer logs
40+
pip-log.txt
41+
pip-delete-this-directory.txt
42+
43+
# Unit test / coverage reports
44+
htmlcov/
45+
.tox/
46+
.nox/
47+
.coverage
48+
.coverage.*
49+
.cache
50+
nosetests.xml
51+
coverage.xml
52+
*.cover
53+
*.py,cover
54+
.hypothesis/
55+
.pytest_cache/
56+
cover/
57+
58+
# Translations
59+
*.mo
60+
*.pot
61+
62+
# Django stuff:
63+
*.log
64+
local_settings.py
65+
db.sqlite3
66+
db.sqlite3-journal
67+
68+
# Flask stuff:
69+
instance/
70+
.webassets-cache
71+
72+
# Scrapy stuff:
73+
.scrapy
74+
75+
# Sphinx documentation
76+
docs/_build/
77+
78+
# PyBuilder
79+
.pybuilder/
80+
target/
81+
82+
# Jupyter Notebook
83+
.ipynb_checkpoints
84+
85+
# IPython
86+
profile_default/
87+
ipython_config.py
88+
89+
# pyenv
90+
# For a library or package, you might want to ignore these files since the code is
91+
# intended to run in multiple environments; otherwise, check them in:
92+
# .python-version
93+
94+
# pipenv
95+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
97+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
98+
# install all needed dependencies.
99+
#Pipfile.lock
100+
101+
# poetry
102+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103+
# This is especially recommended for binary packages to ensure reproducibility, and is more
104+
# commonly ignored for libraries.
105+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106+
#poetry.lock
107+
108+
# pdm
109+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110+
#pdm.lock
111+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112+
# in version control.
113+
# https://pdm.fming.dev/#use-with-ide
114+
.pdm.toml
115+
116+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117+
__pypackages__/
118+
119+
# Celery stuff
120+
celerybeat-schedule
121+
celerybeat.pid
122+
123+
# SageMath parsed files
124+
*.sage.py
125+
126+
# Environments
127+
.env
128+
.venv
129+
env/
130+
venv/
131+
ENV/
132+
env.bak/
133+
venv.bak/
134+
135+
# Spyder project settings
136+
.spyderproject
137+
.spyproject
138+
139+
# Rope project settings
140+
.ropeproject
141+
142+
# mkdocs documentation
143+
/site
144+
145+
# mypy
146+
.mypy_cache/
147+
.dmypy.json
148+
dmypy.json
149+
150+
# Pyre type checker
151+
.pyre/
152+
153+
# pytype static type analyzer
154+
.pytype/
155+
156+
# Cython debug symbols
157+
cython_debug/
158+
159+
# PyCharm
160+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162+
# and can be added to the global gitignore or merged into this file. For a more nuclear
163+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
164+
#.idea/
165+
166+
### Python Patch ###
167+
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168+
poetry.toml
169+
170+
# ruff
171+
.ruff_cache/
172+
173+
# LSP config files
174+
pyrightconfig.json
175+
176+
# End of https://www.toptal.com/developers/gitignore/api/python
177+
178+
# vscode
179+
.vscode/**
180+
!.vscode/launch.json
181+
182+
# nextflow
183+
work/
184+
185+
## nextflow hidden files
186+
.nextflow
187+
.nextflow.log*
188+
189+
## nextflow output
190+
trace*
191+
timeline*
192+
report*
193+
194+
195+
# old file
196+
*_old*
197+
_trash
198+
199+
# test directory
200+
tests/output
201+
database

.vscode/launch.json

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
{
2+
// Use IntelliSense to learn about possible attributes.
3+
// Hover to view descriptions of existing attributes.
4+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"name": "CreateDatasets",
9+
"type": "python",
10+
"request": "launch",
11+
"program": "${workspaceFolder}/containers_build/boostdm/annotations/cohort.py",
12+
"console": "integratedTerminal",
13+
"justMyCode": true,
14+
"cwd": "${workspaceFolder}/tests/output",
15+
"env": {
16+
"GENOME_BUILD": "hg38",
17+
"INTOGEN_DATASETS": "/workspace/datasets/intogen/runs/v2024/20240409_ALL/",
18+
"BOOSTDM_DIR": "/workspace/datasets/boostdm_runs/boostdm-cancer-output-2023",
19+
"BOOSTDM_DATASETS": "/workspace/projects/intogen_plus/fixdatasets-20230223/containers/datasets_24/boostdm"
20+
},
21+
"args": [
22+
"--cohort",
23+
"TCGA_WXS_BRCA",
24+
"--dndscv-path",
25+
"/workspace/datasets/intogen/runs/v2024/20240409_ALL/steps/dndscv/TCGA_WXS_BRCA.dndscv.tsv.gz",
26+
"--dndscv-annotmuts-path",
27+
"/workspace/datasets/intogen/runs/v2024/20240409_ALL/steps/dndscv/TCGA_WXS_BRCA.dndscv_annotmuts.tsv.gz",
28+
"--mutrate-path",
29+
"/workspace/datasets/intogen/runs/v2024/20240409_ALL/steps/boostDM/mutrate/TCGA_WXS_BRCA.mutrate.json",
30+
"--clustl-group-path",
31+
"/workspace/datasets/boostdm_runs/boostdm-cancer-output-2023/output_20230710/features_group/clustl.tsv.gz",
32+
"--hotmaps-group-path",
33+
"/workspace/datasets/boostdm_runs/boostdm-cancer-output-2023/output_20230710/features_group/hotmaps.tsv.gz",
34+
"--smregions-group-path",
35+
"/workspace/datasets/boostdm_runs/boostdm-cancer-output-2023/output_20230710/features_group/smregions.tsv.gz",
36+
"--splits",
37+
"50",
38+
"--threshold",
39+
"0.85",
40+
"--out",
41+
"TCGA_WXS_BRCA.regression_data.tsv"
42+
]
43+
},
44+
]
45+
}

containers_build/boostdm/annotations/cohort.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from boostdm import BoostDMError
1212
from boostdm.annotations.utils import encode_consequence_type, rectify_synonymous, rectify_missense, rectify_splicing
13-
from boostdm.globals import CANONICAL_TRANSCRIPTS_FILE, MNVS_FILE, COHORTS_PATH, DRIVERS_PATH
13+
from boostdm.globals import MANE_TRANSCRIPTS_FILE, MNVS_FILE, COHORTS_PATH, DRIVERS_PATH
1414
from boostdm.oncotree import Oncotree
1515
from boostdm.features import phylop, consequence_type, aachange, exon, ptms, clustl, hotmaps, smregions, dndscv
1616
from boostdm.passengers import retrieve_exons, randomize
@@ -68,7 +68,7 @@ def retrieve_expectation(exp_dict, v):
6868
def set_string_chr(row):
6969
try:
7070
return str(int(row["chr"]))
71-
except:
71+
except ValueError:
7272
return str(row["chr"])
7373

7474

@@ -149,16 +149,16 @@ def mnvs_to_remove():
149149

150150
def retrieve_transcript():
151151

152-
"""Returns dataframe with canonical transcript regions"""
152+
"""Returns dataframe with mane transcript regions (cds + 25bp for splicing)"""
153153

154-
canonical_transcript_df = pd.read_csv(CANONICAL_TRANSCRIPTS_FILE,
154+
mane_transcript_df = pd.read_csv(MANE_TRANSCRIPTS_FILE,
155155
sep='\t', header=None, compression='gzip', low_memory=False, skiprows=1)
156156

157157
# TODO: verify the columns we are selecting are the right ones
158158

159-
canonical_transcript_df = canonical_transcript_df[[0, 1, 2, 6]].copy()
160-
canonical_transcript_df.columns = ['chr', 'start', 'end', 'gene']
161-
return canonical_transcript_df
159+
mane_transcript_df = mane_transcript_df[[0, 1, 2, 6]].copy()
160+
mane_transcript_df.columns = ['chr', 'start', 'end', 'gene']
161+
return mane_transcript_df
162162

163163

164164
def intersect_region_mutations(cds, pos):
@@ -228,8 +228,8 @@ def initialize_trainset(df, drivers):
228228

229229
def build_positive_set(df_expect):
230230

231-
canonical_transcript = retrieve_transcript()
232-
pos = intersect_region_mutations(canonical_transcript, df_expect)
231+
mane_transcript = retrieve_transcript()
232+
pos = intersect_region_mutations(mane_transcript, df_expect)
233233
pos['response'] = 1
234234
return pos
235235

containers_build/boostdm/annotations/gene.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def read_muts(path_data):
2727
'Canonical': 'CANONICAL'}, inplace=True)
2828

2929

30-
muts = muts[muts['CANONICAL'] == 'YES']
30+
muts = muts[muts['MANE_SELECT'] != '-']
3131
if muts.shape[0] == 0:
3232
raise Exception('There are not mutations in the canonical transcript')
3333

containers_build/boostdm/features/aachange.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010
def get_aachange(chr_, pos, alt, gene, reader):
1111

1212
for data in reader.get(chr_, pos, pos):
13-
alt_vep = (data[3] == alt)
14-
canonical_vep = (data[-4] == 'YES')
15-
correct_gene = (data[-7] == gene) # skip cases with antisense overlapping gene (gene is gene_symbol)
16-
if alt_vep and canonical_vep and correct_gene:
17-
aas = data[11] # [11] -> amino-acids involved in change ("I/T")
18-
aa_pos = data[10] # [10] -> amino-acid position
13+
alt_vep = (data['ALT'] == alt)
14+
mane_vep = (data['MANE_SELECT'] != '-') # impose MANE transcript
15+
correct_gene = (data['SYMBOL'] == gene) # skip cases with antisense overlapping gene (gene is gene_symbol)
16+
if alt_vep and mane_vep and correct_gene:
17+
aas = data['AA'] # amino acids involved in the change, e.g. "I/T"
18+
aa_pos = data['PROT_POS'] # amino-acid position
1919
if '/' in aas:
2020
aa_ref, aa_alt = tuple(aas.split('/'))
2121
return aa_ref + aa_pos + aa_alt

containers_build/boostdm/features/consequence_type.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ def get_csqn_type(chr_, pos, alt, gene, reader):
99

1010
for data in reader.get(chr_, pos, pos):
1111

12-
alt_vep = (data[3] == alt) # same alternate allele
13-
canonical_vep = (data[-4] == 'YES') # impose canonical transcript
14-
correct_gene = (data[-7] == gene) # skip cases with antisense overlapping genes
15-
if alt_vep and canonical_vep and correct_gene:
16-
csqn = CONSEQUENCES_LIST[min([CONSEQUENCES_DICT[c] for c in data[7].split(',')])]
12+
alt_vep = (data['ALT'] == alt) # same alternate allele
13+
mane_vep = (data["MANE_SELECT"] != '-') # impose MANE transcript
14+
correct_gene = (data["SYMBOL"] == gene) # skip cases with antisense overlapping genes
15+
if alt_vep and mane_vep and correct_gene:
16+
csqn = CONSEQUENCES_LIST[min([CONSEQUENCES_DICT[c] for c in data["CNSQ"].split(',')])]
1717
return AGGREGATION_DICT.get(csqn, None)
1818

1919
return None

containers_build/boostdm/features/exon.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ def nmd_rule(exon, total_exons):
2626
def get_exon(chr_, pos, alt,gene, reader):
2727

2828
for data in reader.get(chr_, pos, pos):
29-
alt_vep = (data[3] == alt)
30-
canonical_vep = (data[-4] == 'YES')
31-
correct_gene = (data[-7] == gene) # skip cases with antisense overlapping gene
32-
if alt_vep and canonical_vep and correct_gene:
33-
exons = data[-2]
29+
alt_vep = (data["ALT"] == alt)
30+
mane_vep = (data["MANE_SELECT"] != '-') # impose MANE transcript
31+
correct_gene = (data["SYMBOL"] == gene) # skip cases with antisense overlapping gene
32+
if alt_vep and mane_vep and correct_gene:
33+
exons = data["EXON"]
3434
if '/' in exons:
3535
exon, total_exons = tuple(exons.split('/'))
3636
else:

0 commit comments

Comments
 (0)