-
Notifications
You must be signed in to change notification settings - Fork 2
/
helpers.py
182 lines (155 loc) · 5.38 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import os
import re
import sys
import pandas as pd
import matplotlib.pyplot as plt
from subprocess import call
from seaborn import scatterplot
from interop import py_interop_run_metrics, py_interop_run, py_interop_table
from numpy import zeros, float32
from bclqc import MAIN_PASSES
# This file includes helper functions used by bclqc.py
def trim_slash(path):
return path[:-1] if path[-1] == '/' else path
def get_run_name(run_dir: str):
"""
Given '.../221013_A01718_0014_AHNYGGDRX2',
returns '221013_A01718_0014_AHNYGGDRX2'
"""
split_fields = run_dir.split('/')
if split_fields[-1] == '':
return split_fields[-2]
else:
return split_fields[-1]
def get_arg_dirs():
if len(sys.argv) < 4:
raise Exception("Usage: python3 bclqc.py [options] fastqs_dir bams_dir run_dir")
run_dir = sys.argv[-1]
bams_dir = sys.argv[-2]
fastqs_dir = sys.argv[-3]
run_name = get_run_name(run_dir)
dirs = [trim_slash(d) for d in [fastqs_dir, bams_dir, run_dir]]
for i, dir in enumerate(dirs):
if run_name not in dir:
dirs[i] = f"{dir}/{run_name}"
return dirs
def get_cli_args():
return parse_args(sys.argv[1:-3])
def get_custom_passes():
return get_cli_args().get('P')
def get_passes():
custom_passes = get_custom_passes()
return custom_passes if custom_passes else MAIN_PASSES
def print_cmd(cmd):
print(" ".join(cmd) + "\n")
def get_exec_cmd():
if '-d' in sys.argv or '--dry-run' in sys.argv:
return print_cmd
else:
return call
def exec_pass(script_name, *args):
call(["bash", f"scripts/{script_name}.sh", *args])
def is_samplesheet(file_name):
return re.search("^SampleSheet_.*\.csv$", file_name)
def get_index(file_name):
# assumes samplesheet format is "SampleSheet_{index}.csv"
return file_name[12:-4]
def get_indices(run_dir):
indices = [get_index(f) for f in os.listdir(run_dir) if is_samplesheet(f)]
if not indices:
raise("No samplesheets found in " + run_dir)
return indices
def get_sample_ids(fastq_list):
fastq_list_df = pd.read_csv(fastq_list)
return set(fastq_list_df['RGSM'])
# creates a dict mapping command line flags to their arguments
# e.x. {'P': ['demux', 'align']}
def parse_args(cli_args):
args_dict = {}
current_flag = ""
for arg in cli_args:
if arg.startswith('-'):
current_flag = arg[1:]
args_dict[current_flag] = []
else:
args_dict[current_flag] += [arg]
return args_dict
def get_custom_passes():
"""
Returns the list of custom passes from the command line.
Assumes usage: `python3 bclqc.py -P pass1 pass2 ... fastqs_dir bams_dir run_dir`
"""
args = parse_args(sys.argv[1:-3])
return args.get('P')
def parse_run_metrics(run_dir: str):
"""
Returns a DataFrame of `interop_imaging_table` or returns None if no data is found.
More info on `interop_imaging_table` and the `interop` library:
http://illumina.github.io/interop/index.html
https://github.com/Illumina/interop
"""
# Initialize interop objects
run_metrics = py_interop_run_metrics.run_metrics()
valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
valid_to_load[py_interop_run.ExtendedTile] = 1
valid_to_load[py_interop_run.Tile] = 1
valid_to_load[py_interop_run.Extraction] = 1
# Read from the run folder
try:
run_metrics.read(run_dir, valid_to_load)
except:
print(f"Error occured trying to open {run_dir}")
return None
# Set up data table
columns = py_interop_table.imaging_column_vector()
py_interop_table.create_imaging_table_columns(run_metrics, columns)
ncolumns = columns.size()
if ncolumns == 0: return None # no data
headers = []
for i in range(ncolumns):
column = columns[i]
if column.has_children():
headers.extend(
[f"{column.name()} ({subname})" for subname in column.subcolumns()])
else:
headers.append(column.name())
column_count = py_interop_table.count_table_columns(columns)
row_offsets = py_interop_table.map_id_offset()
py_interop_table.count_table_rows(run_metrics, row_offsets)
data = zeros((row_offsets.size(), column_count), dtype=float32)
# Populate table
py_interop_table.populate_imaging_table_data(
run_metrics, columns, row_offsets, data.ravel()
)
return pd.DataFrame(data, columns=headers)
def save_occ_pf_plot(run_dir, output_dir, exec_cmd=call):
"""
Saves a % Occupied x % Pass Filter scatter to `SAVE_DIR`.
"""
df = parse_run_metrics(run_dir)
if df is None:
print("Unable to parse Interop files.")
return
x = "% Occupied"
y = "% Pass Filter"
views = ["Lane"] # can add "Tile" or "Cycle" for more views
for view in views:
scatterplot(
data=df,
x=x,
y=y,
hue=view,
alpha=0.5,
s=8,
)
plt.xlim([0, 100])
plt.ylim([50, 100])
plt.legend(title=view, bbox_to_anchor=[1.2, 0.9])
plt.tight_layout()
image_path = output_dir + f"/occ_pf_{view.lower()}_mqc.jpg"
if exec_cmd == call:
print("saving occ pf graph to " + image_path)
plt.savefig(image_path, dpi=300)
else:
print("DRY RUN: saving occ pf graph to " + image_path)
plt.close()