-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Module/igv/1.0 #303
base: master
Are you sure you want to change the base?
Module/igv/1.0 #303
Changes from all commits
bcf9916
f2091a4
f6980b9
e54b85c
f46ff3f
d0693a6
8fa8bab
d3b8e1c
3791dd1
e0c184d
07bea16
ed501fb
2bb98e7
ff248e6
e389982
31cba38
77f9d76
0bf93ad
f0047d5
a9a27fe
28c54f1
4318b8d
7805531
0f4c924
383ff17
13d6992
c9a8abc
6bb0861
351019f
9764101
874d594
4c34334
901ab8d
c229e74
a5e04c4
bf0154e
b151070
0e9e62c
80dcd76
774ed17
cae1f6e
a376b4b
9490642
63ca909
58529cc
5da0a49
95900fd
7a9b0f3
2054e41
aa2d167
b5ff4f9
6717cd7
60b84e8
c5f443f
c5444f6
bf68f6c
1f305bc
d4d163d
00ff6dd
7686bf6
2ac9e62
2a5462b
0491dc0
c4c1290
dba7383
94f20a8
41c5eff
636dc7b
1f4ecfa
f2a5fe3
1af181b
838efdf
28ad3c2
e6a1c3c
6f71a23
c06b404
1b9facb
6a2a468
fe60b94
e76c9f6
801f308
5f971b7
d35faaa
91c0355
1665b7f
8b3c520
91df3cf
d7c8444
97ff367
9573e59
ad03a99
8d0af4e
f02f0fb
870068a
4ac8628
c447af1
ba82c64
b04e6ce
9aee267
5971917
57e96e0
d6eabda
a7f33e7
ff04705
e62ce63
064ac93
d8fa50c
f565b30
95f8f27
0a5efd7
406f253
f3e3183
fea42ec
b0a69bf
50bfa6c
746b073
7c536f1
27363e9
24ea395
0ee08cb
f2f6807
e044266
c1f3806
0a7b028
0f21edc
4e01bac
a821032
e48e65e
7d462f7
931a8b5
e922df5
2cdf675
7290191
5b9551c
f864881
7df9f19
ef47a17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
lcr-modules: | ||
|
||
igv: | ||
|
||
inputs: | ||
# Available wildcards: {seq_type} {tumour_id} {normal_sample_id} {pair_status} {genome_build} | ||
maf: "__UPDATE__" | ||
|
||
# Available wildcards: {seq_type} {sample_id} {genome_build} | ||
bam_path: "__UPDATE__" | ||
bai_path: "__UPDATE__" | ||
|
||
|
||
regions: | ||
# Provide regions files as lists in their respective genome builds so that liftover of coordinates occurs properly | ||
# Please provide at least one regions file to filter MAF variants | ||
oncodriveclustl: | ||
grch37: ["__UPDATE__"] | ||
hg38: [] | ||
hotmaps: | ||
grch37: [] | ||
hg38: [] | ||
bed: | ||
grch37: [] | ||
hg38: [] | ||
maf: | ||
grch37: [] | ||
hg38: [] | ||
mutation_id: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add here the example formatting of what is expected in that file? Does it have to have a header and expects certain column names? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done ! I added what column and format is required for mutation_id file format |
||
# mutation_id format: minimum requirements are header containing "mutation_id_{regions_build}" column with values in {chr}:{pos} format | ||
# e.g | ||
# mutation_id_grch37 | ||
# chr22:23230361 | ||
grch37: [] # e.g at minimum requires column mutation_id_grch37 | ||
hg38: [] # e.g at minimum requires column mutation_id_hg38 | ||
|
||
# Stop snakefile after MAF filtering step to estimate total number of snapshots that will be taken without running IGV | ||
estimate_only: False | ||
|
||
options: | ||
|
||
igv_version: "https://data.broadinstitute.org/igv/projects/downloads/2.7/IGV_Linux_2.7.2.zip" | ||
|
||
genome_map: | ||
# Maps metadata builds to either grch37 or hg38 so that MAF file locations are determined correctly. Additional genome builds can be added as necessary. | ||
grch37: ["grch37","hg19","hs37d5"] | ||
hg38: ["hg38","grch38"] | ||
|
||
liftover_regions: | ||
liftover_minMatch: "0.95" # Float number from 0 to 1 indicating minimal mapping when converting to a different genome build | ||
|
||
generate_batch_script: | ||
padding: 100 # Base pairs upstream and downstream of variant position | ||
max_height: 1000 # Maximum height of snapshot | ||
sleep_timer: 2000 # Batch scripts with more options may require longer sleep intervals | ||
igv_options: | ||
# Presets for IGV snapshots | ||
# Available igv options: https://github.com/igvteam/igv/wiki/Batch-commands | ||
default: ["preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort"] | ||
pairs: ["viewaspairs", "preference SAM.COLOR_BY READ_STRAND", "preference SAM.SHOW_CENTER_LINE TRUE", "preference SAM.SHADE_BASE_QUALITY true", "preference SAM.DOWNSAMPLE_READS FALSE", "preference SAM.ALLELE_THRESHOLD 0.05", "sort QUALITY"] | ||
|
||
igv_presets: ["default"] # Available options: "default" "pairs" | ||
|
||
xvfb_parameters: | ||
# Server options for running xvfb | ||
server_number: "99" | ||
server_args: "" | ||
|
||
quality_control: | ||
# Truncated heights that have been previously observed for dimensions 1920x1080x24 | ||
truncated: [506,533,545,547,559,570] | ||
# Kurtosis and skewness values observed in blank snapshots at different height values | ||
blank: | ||
"547": | ||
kurtosis: 18.5 | ||
skewness: -4 | ||
"559": | ||
kurtosis: 18.2 | ||
skewness: -4 | ||
# Previously observed heights of snapshots that fail IGV | ||
failed: [506,533] | ||
|
||
scripts: | ||
format_regions: "etc/format_regions.py" | ||
filter_script: "etc/filter_maf.py" | ||
region_liftover_script: "{SCRIPTSDIR}/liftover/1.0/liftover.sh" | ||
batch_script_per_variant: "etc/generate_batch_script_per_variant.py" | ||
quality_control: "etc/quality_control.py" | ||
|
||
scratch_subdirectories: [] | ||
|
||
conda_envs: | ||
liftover: "{SCRIPTSDIR}/liftover/1.0/liftover.yaml" | ||
wget: "{MODSDIR}/envs/wget/wget-1.20.1.yaml" | ||
|
||
threads: 4 | ||
|
||
resources: | ||
_igv_liftover_regions: | ||
mem_mb: 2000 | ||
_igv_run: | ||
mem_mb: 2500 | ||
_igv_quality_control: | ||
mem_mb: 2500 | ||
|
||
pairing_config: | ||
genome: | ||
run_paired_tumours: True | ||
run_unpaired_tumours_with: "unmatched_normal" | ||
run_paired_tumours_as_unpaired: False | ||
capture: | ||
run_paired_tumours: True | ||
run_unpaired_tumours_with: "unmatched_normal" | ||
run_paired_tumours_as_unpaired: False | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../../envs/wget/wget-1.20.1.yaml |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
#!/usr/bin/env python | ||
|
||
import os | ||
import sys | ||
import logging | ||
import traceback | ||
import pandas as pd | ||
import oncopipe as op | ||
|
||
|
||
def log_exceptions(exctype, value, tb): | ||
logging.critical(''.join(traceback.format_tb(tb))) | ||
logging.critical('{0}: {1}'.format(exctype, value)) | ||
|
||
sys.excepthook = log_exceptions | ||
|
||
def main(): | ||
|
||
with open(snakemake.log[0], "w") as stdout: | ||
# Set up logging | ||
sys.stdout = stdout | ||
|
||
try: | ||
|
||
maf_file = snakemake.input[0] | ||
|
||
regions_file = snakemake.input[1] | ||
regions_format = snakemake.params[0] | ||
|
||
metadata = snakemake.params[1] | ||
|
||
output_file = snakemake.output[0] | ||
|
||
# Return empty dataframe if no lines in MAF | ||
line_count = count_lines(maf_file) | ||
if line_count == 1: | ||
empty_maf = pd.read_table(maf_file, comment="#", sep="\t") | ||
# Add columns required by workflow | ||
required_columns = ["seq_type","genome_build","chr_std"] | ||
empty_maf = empty_maf.assign(**{col:None for col in required_columns if col not in empty_maf.columns}) | ||
write_output(empty_maf, output_file) | ||
exit() | ||
|
||
maf = maf_add_columns(maf=maf_file, metadata=metadata, wildcards=snakemake.wildcards) | ||
|
||
# Perform filtering | ||
filtered_maf = maf_filter( | ||
maf=maf, | ||
regions=regions_file, | ||
regions_format=regions_format | ||
) | ||
|
||
write_output(filtered_maf, output_file) | ||
|
||
except Exception as e: | ||
logging.error(e, exc_info=1) | ||
raise | ||
|
||
def count_lines(maf): | ||
with open(maf, "r") as handle: | ||
total_lines = len(handle.readlines()) | ||
return total_lines | ||
|
||
def filter_by_bed(maf, regions): | ||
|
||
# Remove row containing column names | ||
regions = regions[regions[0].str.contains("chrom")==False] | ||
|
||
# Create common columns between BED and MAF | ||
regions["chr_std"] = regions.apply(lambda x: "chr" + str(x[0]).replace("chr",""), axis=1) | ||
regions["genomic_pos_std"] = regions["chr_std"] + ":" + regions[1].map(str) | ||
|
||
maf["chr_std"] = maf.apply(lambda x: "chr" + str(x["Chromosome"]).replace("chr",""), axis=1) | ||
maf["genomic_pos_std"] = maf["chr_std"] + ":" + maf["Start_Position"].map(str) | ||
|
||
filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] | ||
return filtered_maf | ||
|
||
def filter_by_maf(maf, regions): | ||
|
||
# Create common column by which to subset MAF | ||
for df in [maf, regions]: | ||
df["chr_std"] = df.apply(lambda x: "chr" + str(x["Chromosome"]).replace("chr",""), axis=1) | ||
df["genomic_pos_std"] = df["chr_std"] + ":" + df["Start_Position"].map(str) | ||
|
||
# Subset the MAF | ||
filtered_maf = maf[maf["genomic_pos_std"].isin(regions["genomic_pos_std"])] | ||
return filtered_maf | ||
|
||
def maf_filter(maf, regions, regions_format): | ||
|
||
if regions_format != "bed": | ||
regions_df = pd.read_table(regions, comment="#", sep="\t") | ||
else: | ||
regions_df = pd.read_table(regions, comment="#", sep="\t", header=None) | ||
|
||
# Return empty dataframe without filtering if df is empty | ||
if len(maf)==0: | ||
return maf | ||
|
||
filter_functions = { | ||
"maf": filter_by_maf, | ||
"bed": filter_by_bed | ||
} | ||
|
||
return filter_functions[regions_format](maf, regions_df) | ||
|
||
def maf_add_columns(maf, metadata, wildcards): | ||
# Read input MAF as df | ||
maf = pd.read_table(maf, comment="#", sep="\t") | ||
|
||
sample_id = snakemake.wildcards["tumour_id"] | ||
seq_type = snakemake.wildcards["seq_type"] | ||
genome_build = snakemake.wildcards["genome_build"] | ||
normal_sample_id = snakemake.wildcards["normal_sample_id"] | ||
pair_status = snakemake.wildcards["pair_status"] | ||
|
||
maf["seq_type"] = seq_type | ||
maf["genome_build"] = genome_build | ||
maf["normal_sample_id"] = normal_sample_id | ||
maf["pair_status"] = pair_status | ||
|
||
return maf | ||
|
||
def write_output(maf, outfile): | ||
maf.to_csv(outfile, sep="\t", na_rep="NA", index=False) | ||
|
||
if __name__ == "__main__": | ||
logging.basicConfig( | ||
level=logging.DEBUG, | ||
filename=snakemake.log[1], | ||
filemode='w' | ||
) | ||
|
||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens if nothing is specified here? Can we add the
"__UPDATE__"
to anything that has to be filled in?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added an
"__UPDATE__"
string and specified that at least one regions file must be provided