From 001b5a3fae333cab5a2a939c3902805a1b0483a1 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 26 Jan 2019 19:09:31 -0800 Subject: [PATCH 1/7] fixes for pytorch1 --- neusomatic/python/call.py | 3 +-- neusomatic/python/dataloader.py | 6 ++++-- neusomatic/python/train.py | 13 ++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index d66bda9..d6d02c3 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -383,8 +383,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, vartype_classes = ['DEL', 'INS', 'NONE', 'SNP'] data_transform = transforms.Compose( - [transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + [transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) num_channels = 119 if ensemble else 26 net = NeuSomaticNet(num_channels) if use_cuda: diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index a976997..35ee0ee 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -388,9 +388,11 @@ def __getitem__(self, index): non_transformed_matrix = np.array(orig_matrix).astype(np.uint8) else: non_transformed_matrix = [] - + + matrix = torch.from_numpy(matrix.transpose((2, 0, 1))) + matrix = matrix.float().div(255) if self.transform is not None: - matrix = self.transform(matrix) + matrix = self.transform(matrix) var_pos = [length, center] var_pos = torch.Tensor(var_pos) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 3f86e07..3c97194 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -101,7 +101,7 @@ def test(net, epoch, validation_loader, use_cuda): for i in range(len(labels)): label = labels[i] - class_correct[label] += compare_labels[i] + class_correct[label] += compare_labels[i].data.cpu().numpy() class_total[label] += 1 for i in range(len(predicted)): label = predicted[i] @@ -110,7 +110,7 @@ def test(net, epoch, validation_loader, use_cuda): compare_len = (len_pred == var_len_s).squeeze() for i in range(len(var_len_s)): len_ = var_len_s[i] - len_class_correct[len_] += compare_len[i] + len_class_correct[len_] += compare_len[i].data.cpu().numpy() len_class_total[len_] += 1 for i in range(len(len_pred)): len_ = len_pred[i] @@ -186,8 +186,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo torch.set_num_threads(num_threads) data_transform = transforms.Compose( - [transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + [transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] ) num_channels = 119 if ensemble else 26 net = NeuSomaticNet(num_channels) @@ -396,14 +395,14 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo if use_cuda: var_len_labels = var_len_labels.cuda() loss = criterion_crossentropy(outputs_classification, labels) + 1 * criterion_smoothl1( - outputs_pos, var_pos_s[:, 1] + outputs_pos.squeeze(1), var_pos_s[:, 1] ) + 1 * criterion_crossentropy2(outputs_len, var_len_labels) loss.backward() optimizer.step() - loss_s.append(loss.data[0]) + loss_s.append(loss.data) - running_loss += loss.data[0] + running_loss += loss.data if i_ % print_freq == print_freq - 1: logger.info('epoch: {}, i: {:>5}, lr: {}, loss: {:.5f}'.format( epoch + 1 + prev_epochs, i_ + 1, From 0c3b8b8f699cb54c4c41afa4265d9956e6497cd1 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 30 Jan 2019 12:17:03 -0800 Subject: [PATCH 2/7] Convert to Python 3 --- 
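Patch note (illustration only, not part of the commit message): the hunks below repeat a few mechanical Python 2 to 3 conversions: wrapping the now-lazy map()/filter() iterators in list(), floor division with //, dict.items() in place of iteritems(), binary-mode pickle I/O, and unpacking tuple parameters inside the function body. The following minimal sketch shows each idiom with hypothetical values; names such as worker and example.idx are illustrative and not from the code base, while pred_vcf_records_path and the centering-offset expression are taken from the hunks themselves.
```
import pickle

# Python 3: map()/filter() return lazy iterators; wrap them in list() when
# the result is indexed, reused, or concatenated (as done throughout call.py,
# dataloader.py, filter_candidates.py, and generate_dataset.py below).
afs = list(map(float, ["0.10", "0.25", "0.40"]))
kept = list(filter(lambda x: x >= 0.2, afs))

# Python 3: "/" is true division; use "//" wherever an integer index or
# width is required, e.g. centering offsets like (matrix_width - ncols) // 2.
offset = (32 - 20) // 2

# Python 3: dict.iteritems() is gone; dict.items() replaces it.
counts = {"SNP": 3, "INS": 1, "DEL": 2}
for vartype, n in counts.items():
    print(vartype, n)

# Python 3 removed tuple parameters (def f((a, b))); multiprocessing workers
# now take a single record and unpack it in the body, as done for
# pred_vcf_records_path, extract_info_tsv, filter_candidates, and find_records.
def worker(record):  # hypothetical worker mirroring the pattern in the patch
    tag, value = record
    return tag, value

# Python 3 pickles are bytes: open index files in binary mode ("wb"/"rb").
with open("example.idx", "wb") as f:  # hypothetical file name
    pickle.dump([0, 17, 42], f)
with open("example.idx", "rb") as f:
    idx = pickle.load(f)
```
The same bytes-versus-str boundary also explains the explicit .encode('utf-8') on pybedtools otherfields and the .decode('ascii') on the base64 output in the generate_dataset.py hunks.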
README.md | 29 +- neusomatic/python/call.py | 47 ++-- neusomatic/python/dataloader.py | 19 +- .../python/extract_postprocess_targets.py | 4 +- neusomatic/python/filter_candidates.py | 39 +-- neusomatic/python/generate_dataset.py | 263 +++++++++--------- neusomatic/python/long_read_indelrealign.py | 114 ++++---- neusomatic/python/merge_tsvs.py | 10 +- neusomatic/python/network.py | 10 +- neusomatic/python/postprocess.py | 8 +- neusomatic/python/preprocess.py | 8 +- neusomatic/python/resolve_scores.py | 16 +- neusomatic/python/resolve_variants.py | 47 ++-- neusomatic/python/scan_alignments.py | 10 +- neusomatic/python/split_bed.py | 4 +- neusomatic/python/train.py | 34 +-- neusomatic/python/utils.py | 4 +- test/NeuSomatic_ensemble.vcf | 10 +- test/NeuSomatic_standalone.vcf | 10 +- 19 files changed, 350 insertions(+), 336 deletions(-) diff --git a/README.md b/README.md index 8b80d88..77c28d5 100644 --- a/README.md +++ b/README.md @@ -29,33 +29,32 @@ doi: https://doi.org/10.1101/393801](https://doi.org/10.1101/393801) ## Availability -NeuSomatic is written in Python and C++ and requires a Unix-like environment to run. It has been successfully tested on CentOS 7. Its deep learning framework is implemented using PyTorch 0.3.1 to enable GPU acceleration for training/testing. +NeuSomatic is written in Python and C++ and requires a Unix-like environment to run. It has been successfully tested on CentOS 7. Its deep learning framework is implemented using PyTorch 1.0.0 to enable GPU acceleration for training/testing. NeuSomatic first scans the genome to identify candidate variants and extract alignment information. -The binary for this step can be obtained in the `neusomatic/bin` folder by running `./build.sh` (which requires cmake 3.12.1 and g++ 5.4.0). +The binary for this step can be obtained in the `neusomatic/bin` folder by running `./build.sh` (which requires cmake 3.13.2 and g++ 5.4.0). 
Python 2.7 and the following Python packages must be installed: -* pytorch 0.3.1 -* torchvision 0.2.0 -* pybedtools 0.7.10 -* pysam 0.14.1 +* pytorch 1.0.0 +* torchvision 0.2.1 +* pybedtools 0.8.0 +* pysam 0.15.2 * zlib 1.2.11 -* numpy 1.14.3 -* scipy 1.1.0 -* biopython 1.68 +* numpy 1.15.4 +* scipy 1.2.0 +* biopython 1.73 It also depends on the following packages: * cuda80 1.0 (if you want to use GPU) -* tabix 0.2.5 +* tabix 0.2.6 * bedtools 2.27.1 -* samtools 1.7 +* samtools 1.9 You can install these packages using [anaconda](https://www.anaconda.com/download)/[miniconda](https://conda.io/miniconda.html) : ``` -conda install zlib=1.2.11 numpy=1.14.3 scipy=1.1.0 -conda install pytorch=0.3.1 torchvision=0.2.0 cuda80=1.0 -c pytorch -conda install cmake=3.12.1 -c conda-forge -conda install pysam=0.14.1 pybedtools=0.7.10 samtools=1.7 tabix=0.2.5 bedtools=2.27.1 biopython=1.68 -c bioconda +conda install zlib=1.2.11 numpy=1.15.4 scipy=1.2.0 cmake=3.13.2 +conda install pysam=0.15.2 pybedtools=0.8.0 samtools=1.9 tabix=0.2.6 bedtools=2.27.1 biopython=1.73 -c bioconda +conda install pytorch=1.0.0 torchvision=0.2.1 cuda80==1.0 -c pytorch ``` Then you can export the conda paths as: ``` diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index e5d3e71..5b945e0 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -79,31 +79,32 @@ def call_variants(net, vartype_classes, call_loader, out_dir, model_tag, use_cud imsave(file_name, non_transformed_matrices[i, :, :, 0:3]) true_path[path] = file_name final_preds[path] = [vartype_classes[predicted[i]], pos_pred[i], len_pred[i], - map(lambda x: round(x, 4), F.softmax( - outputs1[i, :], 0).data.cpu().numpy()), - map(lambda x: round(x, 4), F.softmax( - outputs3[i, :], 0).data.cpu().numpy()), - map(lambda x: round(x, 4), - outputs1.data.cpu()[i].numpy()), - map(lambda x: round(x, 4), - outputs3.data.cpu()[i].numpy())] + list(map(lambda x: round(x, 4), F.softmax( + outputs1[i, :], 0).data.cpu().numpy())), + list(map(lambda x: round(x, 4), F.softmax( + outputs3[i, :], 0).data.cpu().numpy())), + list(map(lambda x: round(x, 4), + outputs1.data.cpu()[i].numpy())), + list(map(lambda x: round(x, 4), + outputs3.data.cpu()[i].numpy()))] else: none_preds[path] = [vartype_classes[predicted[i]], pos_pred[i], len_pred[i], - map(lambda x: round(x, 4), F.softmax( - outputs1[i, :], 0).data.cpu().numpy()), - map(lambda x: round(x, 4), F.softmax( - outputs3[i, :], 0).data.cpu().numpy()), - map(lambda x: round(x, 4), - outputs1.data.cpu()[i].numpy()), - map(lambda x: round(x, 4), - outputs3.data.cpu()[i].numpy())] + list(map(lambda x: round(x, 4), F.softmax( + outputs1[i, :], 0).data.cpu().numpy())), + list(map(lambda x: round(x, 4), F.softmax( + outputs3[i, :], 0).data.cpu().numpy())), + list(map(lambda x: round(x, 4), + outputs1.data.cpu()[i].numpy())), + list(map(lambda x: round(x, 4), + outputs3.data.cpu()[i].numpy()))] if (iii % 10 == 0): logger.info("Called {} candidates in this batch.".format(j)) logger.info("Called {} candidates in this batch.".format(j)) return final_preds, none_preds, true_path -def pred_vcf_records_path((path, true_path_, pred_all, chroms, vartype_classes, ref_file)): +def pred_vcf_records_path(record): + path, true_path_, pred_all, chroms, vartype_classes, ref_file = record thread_logger = logging.getLogger( "{} ({})".format(pred_vcf_records_path.__name__, multiprocessing.current_process().name)) try: @@ -196,7 +197,7 @@ def pred_vcf_records_path((path, true_path_, pred_all, chroms, vartype_classes, b = (anchor[0] - 
col_2_pos[anchor[1]]) for i in nzref_pos: col_2_pos[i] += b - pos_2_col = {v: k for k, v in col_2_pos.iteritems()} + pos_2_col = {v: k for k, v in col_2_pos.items()} if abs(center_pred - center) < too_far_center: if type_pred == "SNP": @@ -311,7 +312,7 @@ def pred_vcf_records(ref_file, final_preds, true_path, chroms, vartype_classes, if o is None: raise Exception("pred_vcf_records_path failed!") - all_vcf_records = filter(None, all_vcf_records) + all_vcf_records = list(filter(None, all_vcf_records)) return all_vcf_records @@ -348,7 +349,7 @@ def get_vcf_records(all_vcf_records): def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_threshold): logger = logging.getLogger(write_vcf.__name__) - vcf_records = filter(lambda x: len(x) > 0, vcf_records) + vcf_records = list(filter(lambda x: len(x) > 0, vcf_records)) vcf_records = sorted(vcf_records, key=lambda x: [chroms_order[x[0]], x[1]]) lines = [] with open(output_vcf, "w") as ov: @@ -417,10 +418,10 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, # 1. filter out unnecessary keys # pretrained_state_dict = { # k: v for k, v in pretrained_state_dict.items() if k in model_dict} - if "module." in pretrained_state_dict.keys()[0] and "module." not in model_dict.keys()[0]: + if "module." in list(pretrained_state_dict.keys())[0] and "module." not in list(model_dict.keys())[0]: pretrained_state_dict = {k.split("module.")[1]: v for k, v in pretrained_state_dict.items( ) if k.split("module.")[1] in model_dict} - elif "module." not in pretrained_state_dict.keys()[0] and "module." in model_dict.keys()[0]: + elif "module." not in list(pretrained_state_dict.keys())[0] and "module." in list(model_dict.keys())[0]: pretrained_state_dict = { ("module." + k): v for k, v in pretrained_state_dict.items() if ("module." 
+ k) in model_dict} @@ -434,7 +435,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, Ls = [] for candidate_file in candidates_tsv: - idx = pickle.load(open(candidate_file + ".idx")) + idx = pickle.load(open(candidate_file + ".idx", "rb")) Ls.append(len(idx) - 1) current_L = 0 diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index f77c337..06ece78 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -36,7 +36,7 @@ def candidate_loader_tsv(tsv, open_tsv, idx, i): tag = fields[2] im = extract_zlib(base64.b64decode(fields[3])) if len(fields) > 4: - anns = map(float, fields[4:]) + anns = list(map(float, fields[4:])) else: anns = [] label = type_class_dict[tag.split(".")[4]] @@ -45,7 +45,8 @@ def candidate_loader_tsv(tsv, open_tsv, idx, i): return tag, im, anns, label -def extract_info_tsv((i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasses_l)): +def extract_info_tsv(record): + i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasses_l = record thread_logger = logging.getLogger( "{} ({})".format(extract_info_tsv.__name__, multiprocessing.current_process().name)) try: @@ -89,7 +90,7 @@ def extract_info_tsv((i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasse im = extract_zlib(base64.b64decode(fields[3])) label = type_class_dict[tag.split(".")[4]] if len(fields) > 4: - anns = map(float, fields[4:]) + anns = list(map(float, fields[4:])) else: anns = [] data.append([tag, im, anns, label]) @@ -147,7 +148,7 @@ def __init__(self, roots, max_load_candidates, transform=None, self.tsvs.append(tsv) for t in range(num_threads): self.open_tsvs[t].append("") - idx = pickle.load(open(tsv + ".idx")) + idx = pickle.load(open(tsv + ".idx", "rb")) self.idxs.append(idx) self.Ls.append(len(idx) - 1) self.data = [] @@ -167,7 +168,7 @@ def __init__(self, roots, max_load_candidates, transform=None, Ls_ = [] for i_b, _ in batch: tsv = self.tsvs[i_b] - max_load_ = self.Ls[i_b] * max_load_candidates / \ + max_load_ = self.Ls[i_b] * max_load_candidates // \ total_L if total_L > 0 else 0 map_args.append([i_b, tsv, self.idxs[i_b], self.Ls[i_b], max_load_, nclasses_t, nclasses_l]) @@ -198,8 +199,8 @@ def __init__(self, roots, max_load_candidates, transform=None, for matrices, data, none_ids, var_ids, count_class_t, count_class_l in records_: self.matrices += matrices self.data += data - self.none_ids += map(lambda x: x + j, none_ids) - self.var_ids += map(lambda x: x + j, var_ids) + self.none_ids += list(map(lambda x: x + j, none_ids)) + self.var_ids += list(map(lambda x: x + j, var_ids)) for k in range(nclasses_t): self.count_class_t[k] += count_class_t[k] for k in range(nclasses_l): @@ -271,12 +272,12 @@ def __getitem__(self, index): h, w, c = matrix.shape r = random.rand() if r < 0.6: - x_left = random.randint((center - 2) * 2 / 3, center - 2) + x_left = random.randint((center - 2) * 2 // 3, center - 2) else: x_left = 0 if r > 0.4: x_right = random.randint( - (w - center - 2) * 2 / 3, w - center - 2) + (w - center - 2) * 2 // 3, w - center - 2) else: x_right = 0 if x_left > 0: diff --git a/neusomatic/python/extract_postprocess_targets.py b/neusomatic/python/extract_postprocess_targets.py index 1fa9c94..7547583 100755 --- a/neusomatic/python/extract_postprocess_targets.py +++ b/neusomatic/python/extract_postprocess_targets.py @@ -34,7 +34,7 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): if not record_set: record_set.append(record) continue - if len(filter(lambda x: (chrom == x[0] and 
(abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos)) <= max_dist)), record_set)) > 0: + if len(list(filter(lambda x: (chrom == x[0] and (abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos)) <= max_dist)), record_set))) > 0: record_set.append(record) continue @@ -45,7 +45,7 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): for ii, record_set in enumerate(record_sets): if len(record_set) > 1: - if filter(lambda x: len(x[2]) != len(x[3]), record_set): + if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)): for x in record_set: fields = x[-1].strip().split() fields[2] = str(ii) diff --git a/neusomatic/python/filter_candidates.py b/neusomatic/python/filter_candidates.py index fd98e33..9c2c310 100755 --- a/neusomatic/python/filter_candidates.py +++ b/neusomatic/python/filter_candidates.py @@ -15,9 +15,10 @@ from utils import safe_read_info_dict -def filter_candidates((candidates_vcf, filtered_candidates_vcf, reference, dbsnp, min_dp, max_dp, good_ao, - min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, - del_merge_min_af, ins_merge_min_af, merge_r)): +def filter_candidates(candidate_record): + candidates_vcf, filtered_candidates_vcf, reference, dbsnp, min_dp, max_dp, good_ao, \ + min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, \ + del_merge_min_af, ins_merge_min_af, merge_r = candidate_record thread_logger = logging.getLogger( "{} ({})".format(filter_candidates.__name__, multiprocessing.current_process().name)) try: @@ -35,7 +36,7 @@ def filter_candidates((candidates_vcf, filtered_candidates_vcf, reference, dbsnp chrom, pos, _, ref, alt, _, _, info_, _, info = line.strip().split() pos = int(pos) loc = "{}.{}".format(chrom, pos) - dp, ro, ao = map(int, info.split(":")[1:4]) + dp, ro, ao = list(map(int, info.split(":")[1:4])) info_dict = dict(map(lambda x: x.split( "="), filter(None, info_.split(";")))) mq_ = safe_read_info_dict(info_dict, "MQ", int, -100) @@ -71,15 +72,15 @@ def filter_candidates((candidates_vcf, filtered_candidates_vcf, reference, dbsnp fasta_file = pysam.Fastafile(reference) good_records = [] dels = [] - for loc, rs in sorted(records.iteritems(), key=lambda x: x[1][0:2]) + \ + for loc, rs in sorted(records.items(), key=lambda x: x[1][0:2]) + \ [["", [["", 0, "", "", 0, 0, 0, ""]]]]: - ins = filter(lambda x: x[2] == "N", rs) + ins = list(filter(lambda x: x[2] == "N", rs)) if len(ins) > 1: # emit ins - afs = map(lambda x: x[6] / float(x[5] + x[6]), ins) + afs = list(map(lambda x: x[6] / float(x[5] + x[6]), ins)) max_af = max(afs) - ins = filter(lambda x: x[6] / float(x[5] + - x[6]) >= (max_af * merge_r), ins) + ins = list(filter(lambda x: x[6] / float(x[5] + + x[6]) >= (max_af * merge_r), ins)) chrom, pos, ref = ins[0][0:3] dp = max(map(lambda x: x[4], ins)) ro = max(map(lambda x: x[5], ins)) @@ -109,7 +110,7 @@ def filter_candidates((candidates_vcf, filtered_candidates_vcf, reference, dbsnp else: ins = [ins[0][:-1]] good_records.extend(ins) - if dels and (ins or filter(lambda x: x[3] != "N" and x[2] != "N", rs)): + if dels and (ins or list(filter(lambda x: x[3] != "N" and x[2] != "N", rs))): # emit del if len(dels) == 1: ro = dels[0][5] @@ -119,11 +120,11 @@ def filter_candidates((candidates_vcf, filtered_candidates_vcf, reference, dbsnp good_records.extend(dels) else: - afs = map(lambda x: x[6] / float(x[5] + x[6]), dels) + afs = list(map(lambda x: x[6] / float(x[5] + x[6]), dels)) max_af = max(afs) merge_r_thr = merge_r * max_af - dels = filter( - lambda x: x[6] / float(x[5] + x[6]) >= 
merge_r_thr, dels) + dels = list(filter( + lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr, dels)) chrom, pos = dels[0][0:2] dp = max(map(lambda x: x[4], dels)) ro = max(map(lambda x: x[5], dels)) @@ -172,12 +173,12 @@ def filter_candidates((candidates_vcf, filtered_candidates_vcf, reference, dbsnp if ao / float(ro + ao) >= ((del_min_af)): good_records.extend(dels) else: - afs = map(lambda x: x[6] / - float(x[5] + x[6]), dels) + afs = list(map(lambda x: x[6] / + float(x[5] + x[6]), dels)) max_af = max(afs) merge_r_thr = merge_r * max_af - dels = filter( - lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr, dels) + dels = list(filter( + lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr, dels)) chrom, pos = dels[0][0:2] dp = max(map(lambda x: x[4], dels)) ro = max(map(lambda x: x[5], dels)) @@ -277,8 +278,8 @@ def filter_candidates((candidates_vcf, filtered_candidates_vcf, reference, dbsnp non_in_dbsnp_ids.append(int(x[5])) for x in non_in_dbsnp_2: non_in_dbsnp_ids.append(int(x[5])) - final_records = map(lambda x: x[1], filter( - lambda x: x[0] in non_in_dbsnp_ids, enumerate(final_records))) + final_records = list(map(lambda x: x[1], filter( + lambda x: x[0] in non_in_dbsnp_ids, enumerate(final_records)))) with open(filtered_candidates_vcf, "w") as o_f: o_f.write("##fileformat=VCFv4.2\n") o_f.write( diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 8525b36..04b4260 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -26,8 +26,8 @@ NUC_to_NUM = {"a": 1, "c": 2, "g": 3, "t": 4, "-": 5, " ": 0, "A": 1, "C": 2, "G": 3, "T": 4, "N": 5, "b": 1, "d": 2, "h": 3, "u": 4} snp = {"a": "b", "c": "d", "g": "h", "t": "u"} -NUM_to_NUC = {v: k for k, v in NUC_to_NUM.iteritems()} -NUM_to_NUC_hp = {v: k for k, v in NUC_to_NUM_hp.iteritems()} +NUM_to_NUC = {v: k for k, v in NUC_to_NUM.items()} +NUM_to_NUC_hp = {v: k for k, v in NUC_to_NUM_hp.items()} def get_type(ref, alt): @@ -115,14 +115,14 @@ def get_variant_matrix_tabix(ref_file, count_bed, record, matrix_base_pad, chrom col_pos_map[pos_] = cnt cnt += 1 curr_pos = pos_ + 1 - matrix_.append(map(int, rec[4].split(":"))) - bq_matrix_.append(map(int, rec[5].split(":"))) - mq_matrix_.append(map(int, rec[6].split(":"))) - st_matrix_.append(map(int, rec[7].split(":"))) - lsc_matrix_.append(map(int, rec[8].split(":"))) - rsc_matrix_.append(map(int, rec[9].split(":"))) + matrix_.append(list(map(int, rec[4].split(":")))) + bq_matrix_.append(list(map(int, rec[5].split(":")))) + mq_matrix_.append(list(map(int, rec[6].split(":")))) + st_matrix_.append(list(map(int, rec[7].split(":")))) + lsc_matrix_.append(list(map(int, rec[8].split(":")))) + rsc_matrix_.append(list(map(int, rec[9].split(":")))) for iii in range(len(tag_matrices_)): - tag_matrices_[iii].append(map(int, rec[10 + iii].split(":"))) + tag_matrices_[iii].append(list(map(int, rec[10 + iii].split(":")))) ref_array.append(NUC_to_NUM_tabix[ref_base]) if ref_base != "-" and pos_ not in col_pos_map: col_pos_map[pos_] = cnt @@ -189,9 +189,9 @@ def align_tumor_normal_matrices(record, tumor_matrix_, bq_tumor_matrix_, mq_tumo raise(RuntimeError( "tumor_col_pos_map and normal_col_pos_map have different keys.")) - pT = map(lambda x: tumor_col_pos_map[x], sorted(tumor_col_pos_map.keys())) - pN = map(lambda x: normal_col_pos_map[ - x], sorted(normal_col_pos_map.keys())) + pT = list(map(lambda x: tumor_col_pos_map[x], sorted(tumor_col_pos_map.keys()))) + pN = list(map(lambda x: normal_col_pos_map[ + x], 
sorted(normal_col_pos_map.keys()))) if pT[0] != pN[0]: logger.error("record: {}".format(record)) @@ -275,9 +275,9 @@ def align_tumor_normal_matrices(record, tumor_matrix_, bq_tumor_matrix_, mq_tumo new_normal_ref_array[map_N] = normal_ref_array new_tumor_col_pos_map = {k: map_T[v] - for k, v in tumor_col_pos_map.iteritems()} + for k, v in tumor_col_pos_map.items()} new_normal_col_pos_map = {k: map_N[v] - for k, v in normal_col_pos_map.iteritems()} + for k, v in normal_col_pos_map.items()} if sum(new_normal_ref_array - new_tumor_ref_array) != 0: logger.error("record: {}".format(record)) @@ -382,8 +382,8 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec a = [-1000] + sorted(np.where((ref_array == 0))[0]) + [1000] b = np.diff((np.diff(a) == 1).astype(int)) a = a[1:-1] - blocks = map(lambda x: [a[x[0]], a[x[1]]], zip( - np.where(b == 1)[0], np.where(b == -1)[0])) + blocks = list(map(lambda x: [a[x[0]], a[x[1]]], zip( + np.where(b == 1)[0], np.where(b == -1)[0]))) if blocks: largest_block = sorted( blocks, key=lambda x: x[1] - x[0] + 1)[-1] @@ -398,19 +398,19 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec ) - set(cols_not_to_del))) if n_col - len(cols_to_del) > tw: mn = min(count_column[np.argsort(count_column)][ - n_col - tw - 1], max(n_row / 5, min_ev_frac_per_col * n_row)) + n_col - tw - 1], max(n_row // 5, min_ev_frac_per_col * n_row)) new_cols_to_del = set(np.where(np.logical_and(count_column <= mn, (ref_array == 0)))[ 0]) - set(cols_to_del) - set(cols_not_to_del) if n_col - (len(cols_to_del) + len(new_cols_to_del)) < tw and len(new_cols_to_del) > 3 and len(alt) > len(ref): - new_cols_to_del = map( - lambda x: [count_column[x], x], new_cols_to_del) + new_cols_to_del = list(map( + lambda x: [count_column[x], x], new_cols_to_del)) new_cols_to_del = sorted(new_cols_to_del, key=lambda x: [x[0], x[1]])[ 0:len(new_cols_to_del) - 4] - new_cols_to_del = map(lambda x: x[1], new_cols_to_del) + new_cols_to_del = list(map(lambda x: x[1], new_cols_to_del)) cols_to_del = sorted(set(list(new_cols_to_del) + list(cols_to_del))) cols_to_del = list(set(cols_to_del)) if n_col - len(cols_to_del) > tw: - for i in range(n_col / 10): + for i in range(n_col // 10): if i not in cols_to_del and sum(rcenter) > -3: ref_b = ref_array[i] if tumor_matrix_[ref_b, i] > .8 * n_row: @@ -434,7 +434,7 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec set(range(largest_block[0], largest_block[1] + 1))))) cols_to_del.sort() - for i, v in col_pos_map.iteritems(): + for i, v in col_pos_map.items(): col_pos_map[i] -= sum(np.array(cols_to_del) < v) tumor_matrix = np.delete(tumor_matrix_, cols_to_del, 1) @@ -464,60 +464,60 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec ncols = tumor_matrix.shape[1] if matrix_width >= ncols: - col_pos_map = {i: v + (matrix_width - ncols) / - 2 for i, v in col_pos_map.iteritems()} + col_pos_map = {i: v + (matrix_width - ncols) // + 2 for i, v in col_pos_map.items()} tumor_count_matrix = np.zeros((5, matrix_width)) - tumor_count_matrix[:, (matrix_width - ncols) / - 2:(matrix_width - ncols) / 2 + ncols] = tumor_matrix + tumor_count_matrix[:, (matrix_width - ncols) // + 2:(matrix_width - ncols) // 2 + ncols] = tumor_matrix bq_tumor_count_matrix = np.zeros((5, matrix_width)) bq_tumor_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = bq_tumor_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = 
bq_tumor_matrix mq_tumor_count_matrix = np.zeros((5, matrix_width)) mq_tumor_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = mq_tumor_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = mq_tumor_matrix st_tumor_count_matrix = np.zeros((5, matrix_width)) st_tumor_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = st_tumor_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = st_tumor_matrix lsc_tumor_count_matrix = np.zeros((5, matrix_width)) lsc_tumor_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = lsc_tumor_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = lsc_tumor_matrix rsc_tumor_count_matrix = np.zeros((5, matrix_width)) rsc_tumor_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = rsc_tumor_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = rsc_tumor_matrix tag_tumor_count_matrices = [] for iii in range(len(tag_tumor_matrices)): tag_tumor_count_matrices.append(np.zeros((5, matrix_width))) tag_tumor_count_matrices[iii][ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = tag_tumor_matrices[iii] + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = tag_tumor_matrices[iii] normal_count_matrix = np.zeros((5, matrix_width)) normal_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = normal_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = normal_matrix bq_normal_count_matrix = np.zeros((5, matrix_width)) bq_normal_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = bq_normal_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = bq_normal_matrix mq_normal_count_matrix = np.zeros((5, matrix_width)) mq_normal_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = mq_normal_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = mq_normal_matrix st_normal_count_matrix = np.zeros((5, matrix_width)) st_normal_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = st_normal_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = st_normal_matrix lsc_normal_count_matrix = np.zeros((5, matrix_width)) lsc_normal_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = lsc_normal_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = lsc_normal_matrix rsc_normal_count_matrix = np.zeros((5, matrix_width)) rsc_normal_count_matrix[ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = rsc_normal_matrix + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = rsc_normal_matrix tag_normal_count_matrices = [] for iii in range(len(tag_normal_matrices)): tag_normal_count_matrices.append(np.zeros((5, matrix_width))) tag_normal_count_matrices[iii][ - :, (matrix_width - ncols) / 2:(matrix_width - ncols) / 2 + ncols] = tag_normal_matrices[iii] + :, (matrix_width - ncols) // 2:(matrix_width - ncols) // 2 + ncols] = tag_normal_matrices[iii] ref_count_matrix = np.zeros((5, matrix_width)) - ref_count_matrix[:, (matrix_width - ncols) / - 2:(matrix_width - ncols) / 2 + ncols] = ref_matrix + ref_count_matrix[:, (matrix_width - ncols) // + 2:(matrix_width - ncols) // 2 + ncols] = ref_matrix else: col_pos_map = {i: int(round(v / float(ncols) * 
matrix_width)) - for i, v in col_pos_map.iteritems()} + for i, v in col_pos_map.items()} tumor_count_matrix = imresize( tumor_matrix, (5, matrix_width)).astype(int) bq_tumor_count_matrix = imresize( @@ -568,8 +568,11 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec tag_normal_count_matrices, center, rlen, col_pos_map] -def prep_data_single_tabix((ref_file, tumor_count_bed, normal_count_bed, record, vartype, rlen, rcenter, ch_order, - matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)): +def prep_data_single_tabix(input_record): + + ref_file, tumor_count_bed, normal_count_bed, record, vartype, rlen, rcenter, ch_order, \ + matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths = input_record + thread_logger = logging.getLogger( "{} ({})".format(prep_data_single_tabix.__name__, multiprocessing.current_process().name)) try: @@ -648,7 +651,7 @@ def prep_data_single_tabix((ref_file, tumor_count_bed, normal_count_bed, record, candidate_mat, 255)).astype(np.uint8) tag = "{}.{}.{}.{}.{}.{}.{}.{}.{}".format(ch_order, pos, ref[0:55], alt[ 0:55], vartype, center, rlen, tumor_cov, normal_cov) - candidate_mat = base64.b64encode(zlib.compress(candidate_mat)) + candidate_mat = base64.b64encode(zlib.compress(candidate_mat)).decode('ascii') return tag, candidate_mat, record[0:4], ann except Exception as ex: thread_logger.error(traceback.format_exc()) @@ -714,8 +717,8 @@ def push_lr(fasta_file, record, left_right_both): new_ref, new_alt] + record[4:]) record = [chrom, new_pos - len(new_ref), new_ref, new_alt] + record[4:] - eqs = map(lambda x: eqs[x], dict( - map(lambda x: ["_".join(map(str, x[1])), x[0]], enumerate(eqs))).values()) + eqs = list(map(lambda x: eqs[x], dict( + map(lambda x: ["_".join(map(str, x[1])), x[0]], enumerate(eqs))).values())) for eq in eqs: c, p, r, a = eq[0:4] @@ -731,7 +734,7 @@ def merge_records(fasta_file, records): if len(records) == 1: return records[0][0:4] chrom = records[0][0] - pos_s = map(lambda x: x[1], records) + pos_s = list(map(lambda x: x[1], records)) pos_m = min(pos_s) - 1 b = pos_m ref2_ = "" @@ -821,7 +824,8 @@ def find_len(ref, alt): return max(len(ref_), len(alt_)) -def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, work_index)): +def find_records(input_record): + work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, work_index = input_record thread_logger = logging.getLogger( "{} ({})".format(find_records.__name__, multiprocessing.current_process().name)) try: @@ -850,12 +854,13 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi pybedtools.BedTool(ensemble_bed).intersect( split_bed, u=True).saveas(split_ensemble_bed_file) pybedtools.BedTool(split_ensemble_bed_file).window(split_pred_vcf_file, w=5, v=True).each( - lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[1]) + 1, x[3], x[4], ".", otherfields=[".", ".", ".", "."])).saveas(split_missed_ensemble_bed_file) + lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[1]) + 1, x[3], x[4], + ".", otherfields=[".".encode('utf-8'), ".".encode('utf-8'), ".".encode('utf-8'), ".".encode('utf-8')])).saveas(split_missed_ensemble_bed_file) concatenate_vcfs( [split_pred_vcf_file, split_missed_ensemble_bed_file], split_pred_with_missed_file) pred_with_missed = pybedtools.BedTool(split_pred_with_missed_file).each( lambda x: pybedtools.create_interval_from_list( - [x[0], str(x[1]), ".", x[3], x[4], ".", ".", ".", ".", 
"."])).sort().saveas( + [x[0], str(x[1]), ".", x[3], x[4], ".".encode('utf-8'), ".".encode('utf-8'), ".".encode('utf-8'), ".".encode('utf-8'), ".".encode('utf-8')])).sort().saveas( split_pred_with_missed_file) not_in_ensemble_bed = pybedtools.BedTool( pred_with_missed).window(split_ensemble_bed_file, w=1, v=True) @@ -933,11 +938,11 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi len(record_[14]) > len(record_[4]) and record_[14][0:len(record_[4])] == record_[4]): ann = record_[15:] if ann: - ann = map(float, ann) + ann = list(map(float, ann)) rrs.append([r_, ann]) max_ann = max(map(lambda x: sum(x[1]), rrs)) if max_ann > 0: - rrs = filter(lambda x: sum(x[1]) > 0, rrs) + rrs = list(filter(lambda x: sum(x[1]) > 0, rrs)) elif max_ann == 0: rrs = rrs[0:1] for r_, ann in rrs: @@ -970,7 +975,7 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi records.append(rr + [str(i)]) i += 1 records_bed = pybedtools.BedTool(map(lambda x: pybedtools.Interval( - x[0], x[1], x[1] + len(x[2]), x[2], x[3], x[4]), records)) + x[0], x[1], x[1] + len(x[2]), x[2], x[3], x[4]), records)).saveas() truth_records = [] i = 0 @@ -984,9 +989,9 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi i += 1 truth_bed = pybedtools.BedTool(map(lambda x: pybedtools.Interval( - x[0], x[1], x[1] + len(x[2]), x[2], x[3], x[4]), truth_records)) + x[0], x[1], x[1] + len(x[2]), x[2], x[3], x[4]), truth_records)).saveas() none_records_0 = records_bed.window(truth_bed, w=5, v=True) - none_records_ids = map(lambda x: int(x[5]), none_records_0) + none_records_ids = list(map(lambda x: int(x[5]), none_records_0)) other_records = records_bed.window(truth_bed, w=5) map_pred_2_truth = {} map_truth_2_pred = {} @@ -1008,7 +1013,7 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi good_records = {"INS": [], "DEL": [], "SNP": []} vtype = {} record_len = {} - for i, js in map_truth_2_pred.iteritems(): + for i, js in map_truth_2_pred.items(): truth_record = truth_records[i] for j in js: record = records[j] @@ -1022,11 +1027,11 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi good_records[vartype].append(j) vtype[j] = vartype - good_records_idx = [i for w in good_records.values() for i in w] + good_records_idx = [i for w in list(good_records.values()) for i in w] remained_idx = sorted(set(range(len(records))) - (set(good_records_idx) | set(none_records_ids))) done_js = list(good_records_idx) - for i, js in map_truth_2_pred.iteritems(): + for i, js in map_truth_2_pred.items(): truth_record = truth_records[i] if set(js) & set(good_records_idx) or set(js) & set(done_js): continue @@ -1045,7 +1050,7 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi mt = merge_records(fasta_file, t_) if mt: mt2, eqs2 = push_lr(fasta_file, mt, 2) - eqs2 = map(lambda x: "_".join(map(str, x[0:4])), eqs2) + eqs2 = list(map(lambda x: "_".join(map(str, x[0:4])), eqs2)) for idx_jj, jj in enumerate(js): for n_merge_j in range(0, len(js) - idx_jj): r_j = js[idx_jj:idx_jj + n_merge_j + 1] @@ -1073,8 +1078,8 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi if not set(js) - set(done_js_): done = True - perfect_idx = [i for w in good_records.values() for i in w] - good_records_idx = [i for w in good_records.values() for i in w] + perfect_idx = [i for w in list(good_records.values()) for i in w] + good_records_idx = [i for w in list(good_records.values()) 
for i in w] remained_idx = sorted(set(range(len(records))) - (set(good_records_idx) | set(none_records_ids))) for j in remained_idx: @@ -1115,10 +1120,10 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi if done: break - good_records_idx = [i for w in good_records.values() for i in w] + good_records_idx = [i for w in list(good_records.values()) for i in w] remained_idx = sorted(set(range(len(records))) - (set(good_records_idx) | set(none_records_ids))) - for i, js in map_truth_2_pred.iteritems(): + for i, js in map_truth_2_pred.items(): truth_record = truth_records[i] if set(js) & set(good_records_idx): continue @@ -1138,10 +1143,10 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi good_records[vartype].append(j) vtype[j] = vartype - good_records_idx = [i for w in good_records.values() for i in w] + good_records_idx = [i for w in list(good_records.values()) for i in w] remained_idx = sorted(set(range(len(records))) - (set(good_records_idx) | set(none_records_ids))) - for i, js in map_truth_2_pred.iteritems(): + for i, js in map_truth_2_pred.items(): truth_record = truth_records[i] if set(js) & set(good_records_idx): continue @@ -1159,10 +1164,10 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi record_len[j] = find_len(ref_t, alt_t) record_center[j] = rc - good_records_idx = [i for w in good_records.values() for i in w] + good_records_idx = [i for w in list(good_records.values()) for i in w] remained_idx = sorted(set(range(len(records))) - (set(good_records_idx) | set(none_records_ids))) - for i, js in map_truth_2_pred.iteritems(): + for i, js in map_truth_2_pred.items(): truth_record = truth_records[i] if set(js) & set(good_records_idx): @@ -1184,10 +1189,10 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi record_len[j] = find_len(ref_t, alt_t) record_center[j] = rc - good_records_idx = [i for w in good_records.values() for i in w] + good_records_idx = [i for w in list(good_records.values()) for i in w] remained_idx = sorted(set(range(len(records))) - (set(good_records_idx) | set(none_records_ids))) - for i, js in map_truth_2_pred.iteritems(): + for i, js in map_truth_2_pred.items(): truth_record = truth_records[i] if set(js) & set(good_records_idx): continue @@ -1203,16 +1208,16 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi record_center[j] = find_i_center(ref, alt) record_len[j] = find_len(ref_t, alt_t) - good_records_idx = [i for w in good_records.values() for i in w] + good_records_idx = [i for w in list(good_records.values()) for i in w] remained_idx = sorted(set(range(len(records))) - (set(good_records_idx) | set(none_records_ids))) - for i, js in map_truth_2_pred.iteritems(): + for i, js in map_truth_2_pred.items(): truth_record = truth_records[i] if set(js) & set(good_records_idx): continue - good_records_idx = [i for w in good_records.values() for i in w] + good_records_idx = [i for w in list(good_records.values()) for i in w] remained_idx = sorted(set(range(len(records))) - (set(good_records_idx) | set(none_records_ids))) for j in remained_idx: @@ -1225,15 +1230,15 @@ def find_records((work, split_region_file, truth_vcf_file, pred_vcf_file, ref_fi vtype[j] = "NONE" record_len[j] = 0 - good_records_idx = [i for w in good_records.values() for i in w] + good_records_idx = [i for w in list(good_records.values()) for i in w] records_r = [records[x] for k, w in good_records.items() for x in w] records_r_bed = 
pybedtools.BedTool(map(lambda x: pybedtools.Interval( - x[0], x[1], x[1] + len(x[2]), x[2], x[3], x[4]), records_r)) + x[0], x[1], x[1] + len(x[2]), x[2], x[3], x[4]), records_r)).saveas() N_none = len(none_records_ids) thread_logger.info("N_none: {} ".format(N_none)) - none_records = map(lambda x: records[x], none_records_ids) + none_records = list(map(lambda x: records[x], none_records_ids)) none_records = sorted(none_records, key=lambda x: [x[0], int(x[1])]) return records_r, none_records, vtype, record_len, record_center, chroms_order, anns @@ -1259,11 +1264,11 @@ def extract_ensemble(work, ensemble_tsv): fields = line.strip().split() fields[2] = str(int(fields[1]) + len(fields[3])) ensemble_pos.append(fields[0:5]) - ensemble_data.append(map(lambda x: float( - x.replace("False", "0").replace("True", "1")), fields[5:105])) + ensemble_data.append(list(map(lambda x: float( + x.replace("False", "0").replace("True", "1")), fields[5:105]))) ensemble_data = np.array(ensemble_data) - cov_features = map(lambda x: x[0], filter(lambda x: x[1] in [ + cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", @@ -1274,48 +1279,48 @@ def extract_ensemble(work, ensemble_tsv): "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp", "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", - ], enumerate(header))) - mq_features = map(lambda x: x[0], filter(lambda x: x[1] in [ - "nBAM_REF_MQ", "nBAM_ALT_MQ", "tBAM_REF_MQ", "tBAM_ALT_MQ"], enumerate(header))) - bq_features = map(lambda x: x[0], filter(lambda x: x[1] in [ - "nBAM_REF_BQ", "nBAM_ALT_BQ", "tBAM_REF_BQ", "tBAM_ALT_BQ"], enumerate(header))) - nm_diff_features = map(lambda x: x[0], filter( - lambda x: x[1] in ["nBAM_NM_Diff", "tBAM_NM_Diff"], enumerate(header))) - ranksum_features = map(lambda x: x[0], filter(lambda x: x[1] in ["nBAM_Z_Ranksums_MQ", "nBAM_Z_Ranksums_BQ", - "nBAM_Z_Ranksums_EndPos", "tBAM_Z_Ranksums_BQ", "tBAM_Z_Ranksums_MQ", "tBAM_Z_Ranksums_EndPos", ], enumerate(header))) - zero_to_one_features = map(lambda x: x[0], filter(lambda x: x[1] in ["if_MuTect", "if_VarScan2", "if_SomaticSniper", "if_VarDict", + ], enumerate(header)))) + mq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "nBAM_REF_MQ", "nBAM_ALT_MQ", "tBAM_REF_MQ", "tBAM_ALT_MQ"], enumerate(header)))) + bq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "nBAM_REF_BQ", "nBAM_ALT_BQ", "tBAM_REF_BQ", "tBAM_ALT_BQ"], enumerate(header)))) + nm_diff_features = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["nBAM_NM_Diff", "tBAM_NM_Diff"], enumerate(header)))) + ranksum_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["nBAM_Z_Ranksums_MQ", "nBAM_Z_Ranksums_BQ", + "nBAM_Z_Ranksums_EndPos", "tBAM_Z_Ranksums_BQ", "tBAM_Z_Ranksums_MQ", "tBAM_Z_Ranksums_EndPos", ], enumerate(header)))) + zero_to_one_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["if_MuTect", "if_VarScan2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", "if_Strelka"] + ["nBAM_Concordance_FET", "nBAM_StrandBias_FET", "nBAM_Clipping_FET", - "tBAM_Concordance_FET", "tBAM_StrandBias_FET", 
"tBAM_Clipping_FET"] + ["if_dbsnp", "COMMON"] + ["M2_STR"], enumerate(header))) - stralka_scor = map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_Score"], enumerate(header))) - stralka_qss = map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_QSS"], enumerate(header))) - stralka_tqss = map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_TQSS"], enumerate(header))) - varscan2_score = map(lambda x: x[0], filter( - lambda x: x[1] in ["VarScan2_Score"], enumerate(header))) - vardict_score = map(lambda x: x[0], filter( - lambda x: x[1] in ["VarDict_Score"], enumerate(header))) - m2_lod = map(lambda x: x[0], filter(lambda x: x[1] in [ - "M2_NLOD", "M2_TLOD"], enumerate(header))) - sniper_score = map(lambda x: x[0], filter( - lambda x: x[1] in ["Sniper_Score"], enumerate(header))) - m2_ecent = map(lambda x: x[0], filter( - lambda x: x[1] in ["M2_ECNT"], enumerate(header))) - sor = map(lambda x: x[0], filter( - lambda x: x[1] in ["SOR"], enumerate(header))) - msi = map(lambda x: x[0], filter( - lambda x: x[1] in ["MSI"], enumerate(header))) - msilen = map(lambda x: x[0], filter( - lambda x: x[1] in ["MSILEN"], enumerate(header))) - shift3 = map(lambda x: x[0], filter( - lambda x: x[1] in ["SHIFT3"], enumerate(header))) - MaxHomopolymer_Length = map(lambda x: x[0], filter( - lambda x: x[1] in ["MaxHomopolymer_Length"], enumerate(header))) - SiteHomopolymer_Length = map(lambda x: x[0], filter( - lambda x: x[1] in ["SiteHomopolymer_Length"], enumerate(header))) - InDel_Length = map(lambda x: x[0], filter( - lambda x: x[1] in ["InDel_Length"], enumerate(header))) + "tBAM_Concordance_FET", "tBAM_StrandBias_FET", "tBAM_Clipping_FET"] + ["if_dbsnp", "COMMON"] + ["M2_STR"], enumerate(header)))) + stralka_scor = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_Score"], enumerate(header)))) + stralka_qss = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_QSS"], enumerate(header)))) + stralka_tqss = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_TQSS"], enumerate(header)))) + varscan2_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["VarScan2_Score"], enumerate(header)))) + vardict_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["VarDict_Score"], enumerate(header)))) + m2_lod = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "M2_NLOD", "M2_TLOD"], enumerate(header)))) + sniper_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Sniper_Score"], enumerate(header)))) + m2_ecent = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["M2_ECNT"], enumerate(header)))) + sor = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["SOR"], enumerate(header)))) + msi = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MSI"], enumerate(header)))) + msilen = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MSILEN"], enumerate(header)))) + shift3 = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["SHIFT3"], enumerate(header)))) + MaxHomopolymer_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MaxHomopolymer_Length"], enumerate(header)))) + SiteHomopolymer_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["SiteHomopolymer_Length"], enumerate(header)))) + InDel_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["InDel_Length"], enumerate(header)))) min_max_features = [[cov_features, 0, 2 * COV], [mq_features, 0, 70], @@ -1340,7 +1345,7 @@ def extract_ensemble(work, ensemble_tsv): [InDel_Length, -30, 30], ] selected_features = sorted([i for f in min_max_features 
for i in f[0]]) - selected_features_tags = map(lambda x: header[x], selected_features) + selected_features_tags = list(map(lambda x: header[x], selected_features)) for i_s, mn, mx in min_max_features: s = ensemble_data[:, np.array(i_s)] s = np.maximum(np.minimum(s, mx), mn) @@ -1387,7 +1392,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be len_candids += len(pybedtools.BedTool(ensemble_bed) .intersect(region_bed_file, u=True)) logger.info("len_candids: {}".format(len_candids)) - num_splits = max(len_candids / split_batch_size, num_threads) + num_splits = max(len_candids // split_batch_size, num_threads) split_region_files = split_region( work, region_bed_file, num_splits, shuffle_intervals=True) @@ -1421,8 +1426,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be for records_r, none_records, vtype, record_len, record_center, chroms_order, anns in records_data: total_ims += len(records_r) + len(none_records) - candidates_split = int(total_ims / tsv_batch_size) + 1 - is_split = total_ims / candidates_split + candidates_split = int(total_ims // tsv_batch_size) + 1 + is_split = total_ims // candidates_split with open(var_vcf, "w") as vv, open(none_vcf, "w") as nv: is_current = 0 is_ = -1 @@ -1519,8 +1524,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be vv.write("\t".join([record[0], str(record[1]), ".", record[2], record[ 3], ".", ".", "TAG={};".format(tag), ".", "."]) + "\n") tsv_idx.append(b_o.tell()) - b_o.write("\t".join([str(cnt_ims), "1", tag, compressed_candidate_mat] + map( - lambda x: str(np.round(x, 4)), ann)) + "\n") + b_o.write("\t".join([str(cnt_ims), "1", tag, compressed_candidate_mat] + list(map( + lambda x: str(np.round(x, 4)), ann))) + "\n") cnt_ims += 1 for x in none_records_done: if x: @@ -1528,11 +1533,11 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be nv.write("\t".join([record[0], str(record[1]), ".", record[2], record[ 3], ".", ".", "TAG={};".format(tag), ".", "."]) + "\n") tsv_idx.append(b_o.tell()) - b_o.write("\t".join([str(cnt_ims), "1", tag, compressed_candidate_mat] + map( - lambda x: str(np.round(x, 4)), ann)) + "\n") + b_o.write("\t".join([str(cnt_ims), "1", tag, compressed_candidate_mat] + list(map( + lambda x: str(np.round(x, 4)), ann))) + "\n") cnt_ims += 1 tsv_idx.append(b_o.tell()) - pickle.dump(tsv_idx, open(candidates_tsv_file + ".idx", "w")) + pickle.dump(tsv_idx, open(candidates_tsv_file + ".idx", "wb")) is_current = is_end if is_current >= total_ims: break diff --git a/neusomatic/python/long_read_indelrealign.py b/neusomatic/python/long_read_indelrealign.py index 37c0c54..1f2da92 100755 --- a/neusomatic/python/long_read_indelrealign.py +++ b/neusomatic/python/long_read_indelrealign.py @@ -15,6 +15,7 @@ import logging from random import shuffle import csv +import functools import numpy as np import pybedtools @@ -125,8 +126,8 @@ def fix_record(self, record, ref_seq): bias = 0 for region_start, region_end, start_idx, end_idx, del_start, del_end, \ pos_start, pos_end, new_cigar, excess_start, excess_end in self.realignments: - c_array = np.array(map(lambda x: [x[0], x[1][1] if x[1][0] - != CIGAR_DEL else 0], enumerate(cigartuples))) + c_array = np.array(list(map(lambda x: [x[0], x[1][1] if x[1][0] + != CIGAR_DEL else 0], enumerate(cigartuples)))) c_map = np.repeat(c_array[:, 0], c_array[:, 1]) c_i = c_map[start_idx - bias] @@ -197,7 +198,7 @@ def fix_record(self, record, ref_seq): if end_hc: 
new_cigartuples_list.append([[CIGAR_HARDCLIP, end_hc]]) - new_cigartuples = reduce( + new_cigartuples = functools.reduce( lambda x, y: merge_cigartuples(x, y), new_cigartuples_list) if len(new_cigartuples) > 2 and new_cigartuples[-1][0] == CIGAR_SOFTCLIP \ and new_cigartuples[-2][0] == CIGAR_DEL: @@ -222,8 +223,8 @@ def fix_record(self, record, ref_seq): record.cigarstring = cigartuple_to_string(new_cigartuples) NM = find_NM(record, ref_seq) - record.tags = filter( - lambda x: x[0] != "NM", record.tags) + [("NM", int(NM))] + record.tags = list(filter( + lambda x: x[0] != "NM", record.tags)) + [("NM", int(NM))] return record @@ -239,8 +240,8 @@ def get_cigar_stat(cigartuple, keys=[]): def find_NM(record, ref_seq): logger = logging.getLogger(find_NM.__name__) - positions = np.array(map(lambda x: x if x else -1, - (record.get_reference_positions(full_length=True)))) + positions = np.array(list(map(lambda x: x if x else -1, + (record.get_reference_positions(full_length=True))))) sc_start = (record.cigartuples[0][0] == CIGAR_SOFTCLIP) * record.cigartuples[0][1] sc_end = (record.cigartuples[-1][0] == @@ -255,9 +256,9 @@ def find_NM(record, ref_seq): non_ins_positions = positions[non_ins] mn, mx = min(non_ins_positions), max(non_ins_positions) refseq = ref_seq.get_seq(mn, mx + 1) - ref_array = np.array(map(lambda x: NUC_to_NUM[x.upper()], list(refseq)))[ + ref_array = np.array(list(map(lambda x: NUC_to_NUM[x.upper()], list(refseq))))[ non_ins_positions - mn] - q_array = np.array(map(lambda x: NUC_to_NUM[x.upper()], list(q_seq)))[ + q_array = np.array(list(map(lambda x: NUC_to_NUM[x.upper()], list(q_seq))))[ non_ins] cigar_stat = get_cigar_stat(record.cigartuples, [CIGAR_DEL, CIGAR_INS]) assert ref_array.shape[0] == q_array.shape[0] @@ -297,13 +298,13 @@ def prepare_fasta(work, region, input_bam, ref_fasta_file, include_ref, split_i) for record in samfile.fetch(region.chrom, region.start, region.end + 1): if record.is_supplementary and "SA" in dict(record.tags): sas = dict(record.tags)["SA"].split(";") - sas = filter(None, sas) - sas_cigs = map(lambda x: x.split(",")[3], sas) + sas = list(filter(None, sas)) + sas_cigs = list(map(lambda x: x.split(",")[3], sas)) if record.cigarstring in sas_cigs: continue - positions = np.array(map(lambda x: x if x else -1, + positions = np.array(list(map(lambda x: x if x else -1, (record.get_reference_positions( - full_length=True)))) + full_length=True))))) if not record.cigartuples: continue sc_start = (record.cigartuples[0][0] == @@ -326,9 +327,9 @@ def prepare_fasta(work, region, input_bam, ref_fasta_file, include_ref, split_i) 0][0] + sc_start if max(end_idx - start_idx, pos_end - pos_start) >= (region.span() * 0.75): - c_array = np.array(map(lambda x: [x[0], x[1][1] if x[1][0] + c_array = np.array(list(map(lambda x: [x[0], x[1][1] if x[1][0] != CIGAR_DEL else 0], - enumerate(record.cigartuples))) + enumerate(record.cigartuples)))) c_map = np.repeat(c_array[:, 0], c_array[:, 1]) c_i = c_map[start_idx] c_e = c_map[end_idx] @@ -344,10 +345,10 @@ def prepare_fasta(work, region, input_bam, ref_fasta_file, include_ref, split_i) non_ins_positions = positions_[non_ins] mn, mx = min(non_ins_positions), max( non_ins_positions) - ref_array = np.array(map(lambda x: + ref_array = np.array(list(map(lambda x: NUC_to_NUM[x.upper()], - list(refseq)))[non_ins_positions - mn] - q_array = np.array(map(lambda x: NUC_to_NUM[x.upper()], list(q_seq)))[ + list(refseq))))[non_ins_positions - mn] + q_array = np.array(list(map(lambda x: NUC_to_NUM[x.upper()], list(q_seq))))[ non_ins] 
cigar_stat = get_cigar_stat( my_cigartuples, [CIGAR_DEL, CIGAR_INS]) @@ -375,13 +376,13 @@ def split_bam_to_chuncks(work, region, input_bam, chunck_size=200, for record in samfile.fetch(region.chrom, region.start, region.end + 1): if record.is_supplementary and "SA" in dict(record.tags): sas = dict(record.tags)["SA"].split(";") - sas = filter(None, sas) - sas_cigs = map(lambda x: x.split(",")[3], sas) + sas = list(filter(None, sas)) + sas_cigs = list(map(lambda x: x.split(",")[3], sas)) if record.cigarstring in sas_cigs: continue positions = np.array( - map(lambda x: x if x else -1, (record.get_reference_positions(full_length=True)))) + list(map(lambda x: x if x else -1, (record.get_reference_positions(full_length=True))))) if not record.cigartuples: continue @@ -402,16 +403,16 @@ def split_bam_to_chuncks(work, region, input_bam, chunck_size=200, records.append([record, len(q_seq)]) - records = map(lambda x: x[0], sorted(records, key=lambda x: x[1])) + records = list(map(lambda x: x[0], sorted(records, key=lambda x: x[1]))) if len(records) < chunck_size * chunck_scale: bams = [input_bam] lens = [len(records)] else: - n_splits = int(max(6, len(records) / chunck_size)) - new_chunck_size = len(records) / n_splits + n_splits = int(max(6, len(records) // chunck_size)) + new_chunck_size = len(records) // n_splits bams = [] lens = [] - n_split = (len(records) / new_chunck_size) + 1 + n_split = (len(records) // new_chunck_size) + 1 if 0 < (len(records) - ((n_split - 1) * new_chunck_size) + new_chunck_size) \ < new_chunck_size * chunck_scale: n_split -= 1 @@ -440,7 +441,7 @@ def split_bam_to_chuncks(work, region, input_bam, chunck_size=200, def read_info(info_file): logger = logging.getLogger(read_info.__name__) info = {} - with open(info_file, 'rb') as csvfile: + with open(info_file, 'r') as csvfile: spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|') for row in spamreader: info[int(row[0])] = Read_Info(row[1:]) @@ -459,7 +460,7 @@ def find_cigar(alignment): raise Exception event_type = augmented_alignment[:-1][event_pos[1:]] - cigartuple = zip(event_type, event_len) + cigartuple = list(zip(event_type, event_len)) return cigartuple_to_string(cigartuple) @@ -476,10 +477,10 @@ def extract_new_cigars(region, info_file, out_fasta_file): set(map(int, records.keys())) ^ set(range(len(records))))) raise Exception - alignment = map(lambda x: x[1], sorted(map(lambda x: [int(x[0]), map(lambda x: 0 if x == "-" - else 1, x[1].seq)], + alignment = list(map(lambda x: x[1], sorted(map(lambda x: [int(x[0]), list(map(lambda x: 0 if x == "-" + else 1, x[1].seq))], records.items()), - key=lambda x: x[0])) + key=lambda x: x[0]))) ref_seq = np.array(alignment[0]) pos_ref = np.cumsum(alignment[0]) - 1 alignment = np.array(alignment[1:]) - ref_seq @@ -527,9 +528,9 @@ def nuc_to_num_convert(nuc): if nuc.upper() not in NUC_to_NUM.keys(): nuc = "-" return NUC_to_NUM[nuc.upper()] - for i, record in records.iteritems(): + for i, record in records.items(): ii = int(i) - 1 - msa[ii] = map(lambda x: nuc_to_num_convert(x), record.seq) + msa[ii] = list(map(lambda x: nuc_to_num_convert(x), record.seq)) msa = np.array(msa, dtype=int) consensus = [] for i in range(align_len): @@ -558,11 +559,11 @@ def get_final_msa(region, msa_0, consensus, out_fasta_file_1, out_fasta_file_fin raise Exception align_len = len(records["0"].seq) msa_1 = [[] for i in range(2)] - for i, record in records.iteritems(): + for i, record in records.items(): ii = int(i) - msa_1[ii] = map(lambda x: nuc_to_num_convert(x), record.seq) + msa_1[ii] = 
list(map(lambda x: nuc_to_num_convert(x), record.seq)) msa_1 = np.array(msa_1, dtype=int) - consensus_array = np.array(map(lambda x: nuc_to_num_convert(x), consensus)) + consensus_array = np.array(list(map(lambda x: nuc_to_num_convert(x), consensus))) consensus_cumsum = np.cumsum(consensus_array > 0) new_cols = np.where(msa_1[1, :] == 0)[0] new_cols -= np.arange(len(new_cols)) @@ -622,7 +623,7 @@ def merge_cigartuples(tuple1, tuple2): def find_realign_dict(realign_bed_file, chrom): logger = logging.getLogger(find_realign_dict.__name__) realign_bed = pybedtools.BedTool( - realign_bed_file).filter(lambda x: x[0] == chrom) + realign_bed_file).filter(lambda x: x[0] == chrom).saveas() realign_dict = {} chrom_regions = set([]) for interval in realign_bed: @@ -638,11 +639,12 @@ def find_realign_dict(realign_bed_file, chrom): excess_end) chrom_regions.add("{}-{}".format(start, end)) chrom_regions = sorted( - map(lambda x: map(int, x.split("-")), chrom_regions)) + map(lambda x: list(map(int, x.split("-"))), chrom_regions)) return realign_dict, chrom_regions -def correct_bam_chrom((work, input_bam, realign_bed_file, ref_fasta_file, chrom)): +def correct_bam_chrom(input_record): + work, input_bam, realign_bed_file, ref_fasta_file, chrom = input_record thread_logger = logging.getLogger( "{} ({})".format(correct_bam_chrom.__name__, multiprocessing.current_process().name)) try: @@ -729,7 +731,7 @@ def correct_bam_all(work, input_bam, output_bam, ref_fasta_file, realign_bed_fil def concatenate_sam_files(files, output, bam_header): logger = logging.getLogger(concatenate_sam_files.__name__) - good_files = filter(lambda x: x and os.path.isfile(x), files) + good_files = list(filter(lambda x: x and os.path.isfile(x), files)) fin = fileinput.input(good_files) with open(output, "w") as merged_fd: with open(bam_header) as bh_fd: @@ -831,9 +833,9 @@ def find_var(out_fasta_file, snp_min_af, del_min_af, ins_min_af, scale_maf): logger.error("sequences are missing in the alignment {}".format( set(map(int, records.keys())) ^ set(range(len(records))))) raise Exception - alignment = np.array(map(lambda x: x[1], sorted(map(lambda x: [int(x[0]), map( - lambda x: NUC_to_NUM[x.upper()], x[1].seq)], records.items()), - key=lambda x: x[0]))) + alignment = np.array(list(map(lambda x: x[1], sorted(map(lambda x: [int(x[0]), list(map( + lambda x: NUC_to_NUM[x.upper()], x[1].seq))], records.items()), + key=lambda x: x[0])))) ref_seq = alignment[0, :] counts = np.zeros((5, alignment.shape[1])) for i in range(5): @@ -897,10 +899,11 @@ def TrimREFALT(ref, alt, pos): return ref, alt, pos -def run_realignment((work, ref_fasta_file, target_region, pad, chunck_size, chunck_scale, - snp_min_af, del_min_af, ins_min_af, len_chr, input_bam, - match_score, mismatch_penalty, gap_open_penalty, gap_ext_penalty, - msa_binary, get_var)): +def run_realignment(input_record): + work, ref_fasta_file, target_region, pad, chunck_size, chunck_scale, \ + snp_min_af, del_min_af, ins_min_af, len_chr, input_bam, \ + match_score, mismatch_penalty, gap_open_penalty, gap_ext_penalty, \ + msa_binary, get_var = input_record thread_logger = logging.getLogger( "{} ({})".format(run_realignment.__name__, multiprocessing.current_process().name)) @@ -1200,8 +1203,8 @@ def long_read_indelrealign(work, input_bam, output_bam, output_vcf, region_bed_f region_bed_merged = region_bed_merged_tmp region_bed_merged.saveas(os.path.join(work, "regions_merged.bed")) - target_regions = map( - lambda x: [x[0], int(x[1]), int(x[2])], region_bed_merged) + target_regions = list(map( + 
lambda x: [x[0], int(x[1]), int(x[2])], region_bed_merged)) get_var = True if output_vcf else False pool = multiprocessing.Pool(num_threads) @@ -1227,10 +1230,10 @@ def long_read_indelrealign(work, input_bam, output_bam, output_vcf, region_bed_f if o is None: raise Exception("run_realignment failed!") - realign_entries = map(lambda x: x[0], realign_output) + realign_entries = list(map(lambda x: x[0], realign_output)) - realign_variants = map(lambda x: x[1], realign_output) - realign_variants = filter(None, realign_variants) + realign_variants = list(map(lambda x: x[1], realign_output)) + realign_variants = list(filter(None, realign_variants)) if get_var: with open(output_vcf, "w") as o_f: @@ -1254,14 +1257,13 @@ def long_read_indelrealign(work, input_bam, output_bam, output_vcf, region_bed_f pybedtools.set_tempdir(pybedtmp) if realign_entries: - realign_entries = reduce(lambda x, y: x + y, realign_entries) + realign_entries = functools.reduce(lambda x, y: x + y, realign_entries) realign_bed_file = os.path.join(work, "realign.bed") realign_entries.sort() relaign_bed = pybedtools.BedTool(map(lambda x: pybedtools.Interval(x[0], x[1], - x[2], x[ - 3], - otherfields=map(str, x[4:])), - realign_entries)).saveas(realign_bed_file) + x[2],x[3], + otherfields=list(map(lambda y:str(y).encode('utf-8'), x[4:]))), + realign_entries)).saveas(realign_bed_file) relaign_bed = pybedtools.BedTool(realign_bed_file) diff --git a/neusomatic/python/merge_tsvs.py b/neusomatic/python/merge_tsvs.py index 76609cd..9be9aef 100755 --- a/neusomatic/python/merge_tsvs.py +++ b/neusomatic/python/merge_tsvs.py @@ -48,7 +48,7 @@ def merge_tsvs(input_tsvs, out, totla_L = 0 for tsv in input_tsvs: - totla_L += len(pickle.load(open(tsv + ".idx"))) - 1 + totla_L += len(pickle.load(open(tsv + ".idx", "rb"))) - 1 totla_L = max(0, totla_L) candidates_per_tsv = max(candidates_per_tsv, np.ceil( totla_L / float(max_num_tsvs)) + 1) @@ -73,7 +73,7 @@ def merge_tsvs(input_tsvs, out, none_f.write(line) if len(none_idx) >= candidates_per_tsv: none_idx.append(none_f.tell()) - pickle.dump(none_idx, open(none_file + ".idx", "w")) + pickle.dump(none_idx, open(none_file + ".idx", "wb")) none_f.close() logger.info( "Done with merge_id: {}".format(len(merged_tsvs))) @@ -90,7 +90,7 @@ def merge_tsvs(input_tsvs, out, var_f.write(line) if len(var_idx) >= candidates_per_tsv: var_idx.append(var_f.tell()) - pickle.dump(var_idx, open(var_file + ".idx", "w")) + pickle.dump(var_idx, open(var_file + ".idx", "wb")) var_f.close() logger.info( "Done with merge_id: {}".format(len(merged_tsvs))) @@ -102,13 +102,13 @@ def merge_tsvs(input_tsvs, out, var_idx = [] if not var_f.closed: var_idx.append(var_f.tell()) - pickle.dump(var_idx, open(var_file + ".idx", "w")) + pickle.dump(var_idx, open(var_file + ".idx", "wb")) var_f.close() logger.info("Done with merge_id: {}".format(len(merged_tsvs))) merged_tsvs.append(var_file) if not keep_none_types and not none_f.closed: none_idx.append(none_f.tell()) - pickle.dump(none_idx, open(none_file + ".idx", "w")) + pickle.dump(none_idx, open(none_file + ".idx", "wb")) none_f.close() logger.info("Done with merge_id: {}".format(len(merged_tsvs))) merged_tsvs.append(none_file) diff --git a/neusomatic/python/network.py b/neusomatic/python/network.py index 35ad45b..da685b7 100755 --- a/neusomatic/python/network.py +++ b/neusomatic/python/network.py @@ -20,13 +20,13 @@ def __init__(self, dim, ks_1=3, ks_2=3, dl_1=1, dl_2=1, mp_ks=3, mp_st=1): super(NSBlock, self).__init__() self.dim = dim self.conv_r1 = nn.Conv2d( - dim, dim, 
kernel_size=ks_1, dilation=dl_1, padding=(dl_1 * (ks_1 - 1)) / 2) + dim, dim, kernel_size=ks_1, dilation=dl_1, padding=(dl_1 * (ks_1 - 1)) // 2) self.bn_r1 = nn.BatchNorm2d(dim) self.conv_r2 = nn.Conv2d( - dim, dim, kernel_size=ks_2, dilation=dl_2, padding=(dl_2 * (ks_2 - 1)) / 2) + dim, dim, kernel_size=ks_2, dilation=dl_2, padding=(dl_2 * (ks_2 - 1)) // 2) self.bn_r2 = nn.BatchNorm2d(dim) self.pool_r2 = nn.MaxPool2d((1, mp_ks), padding=( - 0, (mp_ks - 1) / 2), stride=(1, mp_st)) + 0, (mp_ks - 1) // 2), stride=(1, mp_st)) def forward(self, x): y1 = (F.relu(self.bn_r1(self.conv_r1(x)))) @@ -56,8 +56,8 @@ def __init__(self, num_channels): rb = NSBlock(dim, ks_1, ks_2, dl_1, dl_2, mp_ks, mp_st) res_layers.append(rb) self.res_layers = nn.Sequential(*res_layers) - ds = np.prod(map(lambda x: x[5], self.nsblocks)) - self.fc_dim = dim * 32 * 5 / ds + ds = np.prod(list(map(lambda x: x[5], self.nsblocks))) + self.fc_dim = dim * 32 * 5 // ds self.fc1 = nn.Linear(self.fc_dim, 240) self.fc2 = nn.Linear(240, 4) self.fc3 = nn.Linear(240, 1) diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 878b70f..87834f2 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -86,7 +86,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, if tag not in tags_info: tags_info[tag] = [] info = x[19].split(":") - dp, ro, ao = map(int, info[1:4]) + dp, ro, ao = list(map(int, info[1:4])) af = float(info[4]) is_same = x[1] == x[11] and x[3] == x[13] and x[4] == x[14] is_same_type = np.sign( @@ -96,7 +96,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, tags_info[tag].append( [~is_same, ~is_same_type, dist, len_diff, s_e, dp, ro, ao, af]) fina_info_tag = {} - for tag, hits in tags_info.iteritems(): + for tag, hits in tags_info.items(): hits = sorted(hits, key=lambda x: x[0:5]) fina_info_tag[tag] = hits[0][5:] @@ -105,8 +105,8 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, fina_info_tag[tag] = [0, 0, 0, 0] scores[tag] = [x[5], x[6], x[7], x[9]] - tags = sorted(fina_info_tag.keys(), key=lambda x: map(int, x.split("-")[0:2] - )) + tags = sorted(fina_info_tag.keys(), key=lambda x: list(map(int, x.split("-")[0:2] + ))) with open(output_vcf, "w") as o_f: o_f.write("##fileformat=VCFv4.2\n") o_f.write("##NeuSomatic Version={}\n".format(__version__)) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 40d297b..ca1dd25 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -22,7 +22,8 @@ from utils import concatenate_vcfs -def split_dbsnp((restart, dbsnp, region_bed, dbsnp_region_vcf)): +def split_dbsnp(record): + restart, dbsnp, region_bed, dbsnp_region_vcf = record thread_logger = logging.getLogger( "{} ({})".format(split_dbsnp.__name__, multiprocessing.current_process().name)) try: @@ -115,7 +116,7 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp dbsnp_regions.append(dbsnp_region_vcf) else: filtered_candidates_vcfs = None - return map(lambda x: x[1], scan_outputs), map(lambda x: x[2], scan_outputs), filtered_candidates_vcfs, dbsnp_regions + return list(map(lambda x: x[1], scan_outputs)), list(map(lambda x: x[2], scan_outputs)), filtered_candidates_vcfs, dbsnp_regions def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, @@ -126,7 +127,8 @@ def generate_dataset_region(work, truth_vcf, mode, 
filtered_candidates_vcf, regi tsv_batch_size) -def get_ensemble_region((reference, ensemble_bed, region, ensemble_bed_region_file, matrix_base_pad)): +def get_ensemble_region(record): + reference, ensemble_bed, region, ensemble_bed_region_file, matrix_base_pad = record thread_logger = logging.getLogger( "{} ({})".format(get_ensemble_region.__name__, multiprocessing.current_process().name)) try: diff --git a/neusomatic/python/resolve_scores.py b/neusomatic/python/resolve_scores.py index 297e02a..a2897c6 100755 --- a/neusomatic/python/resolve_scores.py +++ b/neusomatic/python/resolve_scores.py @@ -35,7 +35,7 @@ def resolve_scores(input_bam, ra_vcf, target_vcf, output_vcf): intervals_dict[id_] = [] intervals_dict[id_].append(interval) - for id_, intervals in intervals_dict.iteritems(): + for id_, intervals in intervals_dict.items(): if len(intervals) == 1: score = intervals[0][15] interval = intervals[0][:10] @@ -45,15 +45,15 @@ def resolve_scores(input_bam, ra_vcf, target_vcf, output_vcf): else: len_ = (len(intervals[0][4]) - len(intervals[0][3])) pos_ = int(intervals[0][1]) - len_diff = map(lambda x: abs( - (len(x[14]) - len(x[13])) - len_), intervals) + len_diff = list(map(lambda x: abs( + (len(x[14]) - len(x[13])) - len_), intervals)) min_len_diff = min(len_diff) - intervals = filter(lambda x: abs( - (len(x[14]) - len(x[13])) - len_) == min_len_diff, intervals) - pos_diff = map(lambda x: abs(int(x[11]) - pos_), intervals) + intervals = list(filter(lambda x: abs( + (len(x[14]) - len(x[13])) - len_) == min_len_diff, intervals)) + pos_diff = list(map(lambda x: abs(int(x[11]) - pos_), intervals)) min_pos_diff = min(pos_diff) - intervals = filter(lambda x: abs( - int(x[11]) - pos_) == min_pos_diff, intervals) + intervals = list(filter(lambda x: abs( + int(x[11]) - pos_) == min_pos_diff, intervals)) score = "{:.4f}".format( np.round(max(map(lambda x: float(x[15]), intervals)), 4)) interval = intervals[0][:10] diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index 20332fe..39181d8 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -70,19 +70,20 @@ def extract_ins(record): return inss -def find_resolved_variants((chrom, start, end, variants, input_bam, reference)): +def find_resolved_variants(input_record): + chrom, start, end, variants, input_bam, reference = input_record thread_logger = logging.getLogger( "{} ({})".format(find_resolved_variants.__name__, multiprocessing.current_process().name)) try: ref = pysam.FastaFile(reference) out_variants = [] - start, end = map(int, [start, end]) + start, end = list(map(int, [start, end])) region = [chrom, start, end] - vartypes = map(lambda x: x[-1], variants) - scores = map(lambda x: x[5], variants) + vartypes = list(map(lambda x: x[-1], variants)) + scores = list(map(lambda x: x[5], variants)) if len(set(vartypes)) > 1: out_variants.extend( - map(lambda x: [x[0], int(x[1]), x[3], x[4], x[10], x[5]], variants)) + list(map(lambda x: [x[0], int(x[1]), x[3], x[4], x[10], x[5]], variants))) else: vartype = vartypes[0] score = max(scores) @@ -93,13 +94,13 @@ def find_resolved_variants((chrom, start, end, variants, input_bam, reference)): for record in samfile.fetch(chrom, start, end): if record.cigarstring and "D" in record.cigarstring: dels.extend(extract_del(record)) - dels = filter(lambda x: ( - start <= x[1] <= end) or start <= x[2] <= end, dels) + dels = list(filter(lambda x: ( + start <= x[1] <= end) or start <= x[2] <= end, dels)) if dels: - intervals = 
map(lambda x: pybedtools.Interval( - x[0], x[1], x[2]), dels) - bed = pybedtools.BedTool(intervals) - del_strs = map(lambda x: "---".join(x[0:3]), bed) + intervals = list(map(lambda x: pybedtools.Interval( + x[0], x[1], x[2]), dels)) + bed = pybedtools.BedTool(intervals).saveas() + del_strs = list(map(lambda x: "---".join(x[0:3]), bed)) uniq_dels = list(set(del_strs)) uniq_dels_count = {} for del_ in uniq_dels: @@ -109,10 +110,10 @@ def find_resolved_variants((chrom, start, end, variants, input_bam, reference)): if uniq_dels_count[del_] <= max_count * 0.5: del uniq_dels_count[del_] new_bed = pybedtools.BedTool(map(lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[2])), - map(lambda x: x.split("---"), uniq_dels_count.keys()))) + list(map(lambda x: x.split("---"), uniq_dels_count.keys())))).saveas() new_bed = new_bed.sort().merge(c=[1], o="count") - out_variants.extend(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int( - x[1]) - 1, int(x[2])), ref.fetch(x[0], int(x[1]) - 1, int(x[1])), "0/1", score], new_bed)) + out_variants.extend(list(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int( + x[1]) - 1, int(x[2])), ref.fetch(x[0], int(x[1]) - 1, int(x[1])), "0/1", score], new_bed))) elif vartype == "INS": intervals = [] inss = [] @@ -120,13 +121,13 @@ def find_resolved_variants((chrom, start, end, variants, input_bam, reference)): for record in samfile.fetch(chrom, start, end): if record.cigarstring and "I" in record.cigarstring: inss.extend(extract_ins(record)) - inss = filter(lambda x: ( - start <= x[1] <= end) or start <= x[2] <= end, inss) + inss = list(filter(lambda x: ( + start <= x[1] <= end) or start <= x[2] <= end, inss)) if inss: - intervals = map(lambda x: pybedtools.Interval( - x[0], x[1], x[2], x[3]), inss) - bed = pybedtools.BedTool(intervals) - ins_strs = map(lambda x: "---".join(x[0:4]), bed) + intervals = list(map(lambda x: pybedtools.Interval( + x[0], x[1], x[2], x[3]), inss)) + bed = pybedtools.BedTool(intervals).saveas() + ins_strs = list(map(lambda x: "---".join(x[0:4]), bed)) uniq_inss = list(set(ins_strs)) uniq_inss_count = {} for ins_ in uniq_inss: @@ -138,9 +139,9 @@ def find_resolved_variants((chrom, start, end, variants, input_bam, reference)): if uniq_inss_count[ins_] <= max_count * 0.5 or 0 < abs(int(ins_.split("---")[1]) - max_pos) < 4: del uniq_inss_count[ins_] new_bed = pybedtools.BedTool(map(lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[2]), x[3]), - map(lambda x: x.split("---"), uniq_inss_count.keys()))).sort() - out_variants.extend(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int( - x[1]) - 1, int(x[1])), ref.fetch(x[0], int(x[1]) - 1, int(x[1])) + x[3], "0/1", score], new_bed)) + list(map(lambda x: x.split("---"), uniq_inss_count.keys())))).sort() + out_variants.extend(list(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int( + x[1]) - 1, int(x[1])), ref.fetch(x[0], int(x[1]) - 1, int(x[1])) + x[3], "0/1", score], new_bed))) return out_variants except Exception as ex: thread_logger.error(traceback.format_exc()) diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index c25e91f..820adb4 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -24,8 +24,10 @@ from split_bed import split_region -def run_scan_alignments((work, reference, scan_alignments_binary, split_region_file, - input_bam, window_size, maf, min_mapq, max_dp, calc_qual, num_threads)): +def run_scan_alignments(record): + work, reference, scan_alignments_binary, split_region_file, \ + input_bam, 
window_size, maf, min_mapq, max_dp, calc_qual, num_threads = record + thread_logger = logging.getLogger( "{} ({})".format(run_scan_alignments.__name__, multiprocessing.current_process().name)) try: @@ -80,7 +82,7 @@ def scan_alignments(work, scan_alignments_binary, input_bam, with pysam.AlignmentFile(input_bam, "rb") as samfile: for chrom, length in zip(samfile.references, samfile.lengths): intervals.append(pybedtools.Interval(chrom, 1, length - 1)) - regions_bed = pybedtools.BedTool(intervals) + regions_bed = pybedtools.BedTool(intervals).saveas() if not os.path.exists(work): os.mkdir(work) total_len = sum(map(lambda x: int(x[2]) - int(x[1]) + 1, regions_bed)) @@ -98,7 +100,7 @@ def scan_alignments(work, scan_alignments_binary, input_bam, regions_bed_file = os.path.join(work, "all_regions.bed") regions_bed.saveas(regions_bed_file) - num_split = max(int(np.ceil((total_len / 10000000) / + num_split = max(int(np.ceil((total_len // 10000000) // num_threads) * num_threads), num_threads) split_region_files = split_region(work, regions_bed_file, num_split, min_region=window_size, max_region=1e20) diff --git a/neusomatic/python/split_bed.py b/neusomatic/python/split_bed.py index 78f620b..f4d91ff 100755 --- a/neusomatic/python/split_bed.py +++ b/neusomatic/python/split_bed.py @@ -30,10 +30,10 @@ def split_region(work, region_bed_file, num_splits, max_region=1000000, min_regi intervals.append(region) if shuffle_intervals: shuffle(intervals) - regions_bed = pybedtools.BedTool(intervals) + regions_bed = pybedtools.BedTool(intervals).saveas() total_len = sum(map(lambda x: int(x[2]) - int(x[1]) + 1, regions_bed)) logger.info("Total length: {}".format(total_len)) - split_len = total_len / num_splits + split_len = total_len // num_splits split_regions = [] current_regions = [] sofar_len = 0 diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 1d67d02..d7a06e9 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -41,21 +41,21 @@ def make_weights_for_balanced_classes(count_class_t, count_class_l, nclasses_t, count_class_l[0] = none_count logger.info("count type classes: {}".format( - zip(vartype_classes, count_class_t))) + list(zip(vartype_classes, count_class_t)))) N = float(sum(count_class_t)) for i in range(nclasses_t): w_t[i] = (1 - (float(count_class_t[i]) / float(N))) / float(nclasses_t) w_t = np.array(w_t) - logger.info("weight type classes: {}".format(zip(vartype_classes, w_t))) + logger.info("weight type classes: {}".format(list(zip(vartype_classes, w_t)))) - logger.info("count length classes: {}".format( - zip(range(nclasses_l), count_class_l))) + logger.info("count length classes: {}".format(list( + zip(range(nclasses_l), count_class_l)))) N = float(sum(count_class_l)) for i in range(nclasses_l): w_l[i] = (1 - (float(count_class_l[i]) / float(N))) / float(nclasses_l) w_l = np.array(w_l) - logger.info("weight length classes: {}".format( - zip(range(nclasses_l), w_l))) + logger.info("weight length classes: {}".format(list( + zip(range(nclasses_l), w_l)))) return w_t, w_l @@ -154,8 +154,8 @@ def __iter__(self): logger = logging.getLogger(SubsetNoneSampler.__iter__.__name__) if self.current_none_id > (len(self.none_indices) - self.none_count): this_round_nones = self.none_indices[self.current_none_id:] - self.none_indices = map(lambda i: self.none_indices[i], - torch.randperm(len(self.none_indices)).tolist()) + self.none_indices = list(map(lambda i: self.none_indices[i], + torch.randperm(len(self.none_indices)).tolist())) self.current_none_id = 
self.none_count - len(this_round_nones) this_round_nones += self.none_indices[0:self.current_none_id] else: @@ -221,10 +221,10 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo # 1. filter out unnecessary keys # pretrained_state_dict = { # k: v for k, v in pretrained_state_dict.items() if k in model_dict} - if "module." in pretrained_state_dict.keys()[0] and "module." not in model_dict.keys()[0]: + if "module." in list(pretrained_state_dict.keys())[0] and "module." not in list(model_dict.keys())[0]: pretrained_state_dict = {k.split("module.")[1]: v for k, v in pretrained_state_dict.items( ) if k.split("module.")[1] in model_dict} - elif "module." not in pretrained_state_dict.keys()[0] and "module." in model_dict.keys()[0]: + elif "module." not in list(pretrained_state_dict.keys())[0] and "module." in list(model_dict.keys())[0]: pretrained_state_dict = { ("module." + k): v for k, v in pretrained_state_dict.items() if ("module." + k) in model_dict} else: @@ -251,11 +251,11 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo Ls = [] for tsv in candidates_tsv: - idx = pickle.load(open(tsv + ".idx")) + idx = pickle.load(open(tsv + ".idx", "rb")) Ls.append(len(idx) - 1) - Ls, candidates_tsv = zip( - *sorted(zip(Ls, candidates_tsv), key=lambda x: x[0], reverse=True)) + Ls, candidates_tsv = list(zip( + *sorted(zip(Ls, candidates_tsv), key=lambda x: x[0], reverse=True))) train_split_tsvs = [] current_L = 0 @@ -287,13 +287,13 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo none_indices = train_set.get_none_indices() var_indices = train_set.get_var_indices() if none_indices: - none_indices = map(lambda i: none_indices[i], - torch.randperm(len(none_indices)).tolist()) + none_indices = list(map(lambda i: none_indices[i], + torch.randperm(len(none_indices)).tolist())) logger.info( "Non-somatic candidates is split {}: {}".format(split_i, len(none_indices))) if var_indices: - var_indices = map(lambda i: var_indices[i], - torch.randperm(len(var_indices)).tolist()) + var_indices = list(map(lambda i: var_indices[i], + torch.randperm(len(var_indices)).tolist())) logger.info("Somatic candidates in split {}: {}".format( split_i, len(var_indices))) none_count = max(min(len(none_indices), len( diff --git a/neusomatic/python/utils.py b/neusomatic/python/utils.py index c09e543..b8eb237 100755 --- a/neusomatic/python/utils.py +++ b/neusomatic/python/utils.py @@ -48,8 +48,8 @@ def concatenate_files(infiles, outfile, check_file_existence=True): def concatenate_vcfs(infiles, outfile, check_file_existence=True, header_string="#"): with open(outfile, "w") as out_fd: # Only keep files which exist - files_to_process = filter(lambda f: f and ( - not check_file_existence or os.path.isfile(f)), infiles) + files_to_process = list(filter(lambda f: f and ( + not check_file_existence or os.path.isfile(f)), infiles)) for index, infile in enumerate(files_to_process): with open(infile) as in_fd: diff --git a/test/NeuSomatic_ensemble.vcf b/test/NeuSomatic_ensemble.vcf index 72ff1b7..a00b347 100644 --- a/test/NeuSomatic_ensemble.vcf +++ b/test/NeuSomatic_ensemble.vcf @@ -11,10 +11,10 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE 22 21330787 . C T 26.9905 PASS SCORE=0.9980;DP=320;RO=231;AO=87;AF=0.2736; GT:DP:RO:AO:AF 0/1:320:231:87:0.2736 -22 21332122 . G A 40.0000 PASS SCORE=0.9999;DP=210;RO=152;AO=58;AF=0.2762; GT:DP:RO:AO:AF 0/1:210:152:58:0.2762 +22 21332122 . 
G A 39.9993 PASS SCORE=0.9999;DP=210;RO=152;AO=58;AF=0.2762; GT:DP:RO:AO:AF 0/1:210:152:58:0.2762 22 21334924 . G C 22.2248 PASS SCORE=0.9940;DP=78;RO=58;AO=20;AF=0.2564; GT:DP:RO:AO:AF 0/1:78:58:20:0.2564 -22 21335259 . C A 29.2086 PASS SCORE=0.9988;DP=205;RO=158;AO=47;AF=0.2293; GT:DP:RO:AO:AF 0/1:205:158:47:0.2293 +22 21335259 . C A 29.2085 PASS SCORE=0.9988;DP=205;RO=158;AO=47;AF=0.2293; GT:DP:RO:AO:AF 0/1:205:158:47:0.2293 22 21384516 . C T 33.0106 PASS SCORE=0.9995;DP=78;RO=51;AO=27;AF=0.3462; GT:DP:RO:AO:AF 0/1:78:51:27:0.3462 -22 21982892 . C T 25.6891 PASS SCORE=0.9973;DP=131;RO=86;AO=45;AF=0.3435; GT:DP:RO:AO:AF 0/1:131:86:45:0.3435 -22 21983260 . A G 26.5777 PASS SCORE=0.9978;DP=85;RO=41;AO=44;AF=0.5176; GT:DP:RO:AO:AF 0/1:85:41:44:0.5176 -22 21989959 . AAG A 36.9899 PASS SCORE=0.9998;DP=100;RO=69;AO=29;AF=0.2959; GT:DP:RO:AO:AF 0/1:100:69:29:0.2959 +22 21982892 . C T 25.6892 PASS SCORE=0.9973;DP=131;RO=86;AO=45;AF=0.3435; GT:DP:RO:AO:AF 0/1:131:86:45:0.3435 +22 21983260 . A G 26.5776 PASS SCORE=0.9978;DP=85;RO=41;AO=44;AF=0.5176; GT:DP:RO:AO:AF 0/1:85:41:44:0.5176 +22 21989959 . AAG A 36.9896 PASS SCORE=0.9998;DP=100;RO=69;AO=29;AF=0.2959; GT:DP:RO:AO:AF 0/1:100:69:29:0.2959 diff --git a/test/NeuSomatic_standalone.vcf b/test/NeuSomatic_standalone.vcf index 3288f06..20b54fd 100644 --- a/test/NeuSomatic_standalone.vcf +++ b/test/NeuSomatic_standalone.vcf @@ -10,11 +10,11 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE -22 21330787 . C T 33.9797 PASS SCORE=0.9996;DP=320;RO=231;AO=87;AF=0.2736; GT:DP:RO:AO:AF 0/1:320:231:87:0.2736 -22 21332122 . G A 36.9897 PASS SCORE=0.9998;DP=210;RO=152;AO=58;AF=0.2762; GT:DP:RO:AO:AF 0/1:210:152:58:0.2762 +22 21330787 . C T 33.9800 PASS SCORE=0.9996;DP=320;RO=231;AO=87;AF=0.2736; GT:DP:RO:AO:AF 0/1:320:231:87:0.2736 +22 21332122 . G A 36.9903 PASS SCORE=0.9998;DP=210;RO=152;AO=58;AF=0.2762; GT:DP:RO:AO:AF 0/1:210:152:58:0.2762 22 21334924 . G C 4.5474 LowQual SCORE=0.6490;DP=78;RO=58;AO=20;AF=0.2564; GT:DP:RO:AO:AF 0/1:78:58:20:0.2564 22 21335259 . C A 25.8517 PASS SCORE=0.9974;DP=205;RO=158;AO=47;AF=0.2293; GT:DP:RO:AO:AF 0/1:205:158:47:0.2293 22 21384516 . C T 8.2334 PASS SCORE=0.8498;DP=78;RO=51;AO=27;AF=0.3462; GT:DP:RO:AO:AF 0/1:78:51:27:0.3462 -22 21982892 . C T 30.0011 PASS SCORE=0.9990;DP=131;RO=86;AO=45;AF=0.3435; GT:DP:RO:AO:AF 0/1:131:86:45:0.3435 -22 21983260 . A G 31.5496 PASS SCORE=0.9993;DP=85;RO=41;AO=44;AF=0.5176; GT:DP:RO:AO:AF 0/1:85:41:44:0.5176 -22 21989959 . AAG A 40.0000 PASS SCORE=0.9999;DP=100;RO=69;AO=29;AF=0.2959; GT:DP:RO:AO:AF 0/1:100:69:29:0.2959 +22 21982892 . C T 30.0008 PASS SCORE=0.9990;DP=131;RO=86;AO=45;AF=0.3435; GT:DP:RO:AO:AF 0/1:131:86:45:0.3435 +22 21983260 . A G 31.5498 PASS SCORE=0.9993;DP=85;RO=41;AO=44;AF=0.5176; GT:DP:RO:AO:AF 0/1:85:41:44:0.5176 +22 21989959 . 
AAG A 39.9993 PASS SCORE=0.9999;DP=100;RO=69;AO=29;AF=0.2959; GT:DP:RO:AO:AF 0/1:100:69:29:0.2959 From 8269698004eb8ed98437832e872d012912ac6804 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 31 Jan 2019 14:34:53 -0800 Subject: [PATCH 3/7] few fixes --- .gitignore | 1 + neusomatic/python/long_read_indelrealign.py | 46 ++++++++++----------- neusomatic/python/postprocess.py | 10 ++--- neusomatic/python/utils.py | 4 +- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 8ec2c20..111da70 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ third_party/* !third_party/*.cmake neusomatic/build/ neusomatic/bin/* +neusomatic/python/__pycache__/ others/ neusomatic/python/*.pyc test/example/ diff --git a/neusomatic/python/long_read_indelrealign.py b/neusomatic/python/long_read_indelrealign.py index 1f2da92..7b4ee36 100755 --- a/neusomatic/python/long_read_indelrealign.py +++ b/neusomatic/python/long_read_indelrealign.py @@ -368,9 +368,9 @@ def prepare_fasta(work, region, input_bam, ref_fasta_file, include_ref, split_i) return in_fasta_file, info_file -def split_bam_to_chuncks(work, region, input_bam, chunck_size=200, - chunck_scale=1.5): - logger = logging.getLogger(split_bam_to_chuncks.__name__) +def split_bam_to_chunks(work, region, input_bam, chunk_size=200, + chunk_scale=1.5): + logger = logging.getLogger(split_bam_to_chunks.__name__) records = [] with pysam.Samfile(input_bam, "rb") as samfile: for record in samfile.fetch(region.chrom, region.start, region.end + 1): @@ -404,22 +404,22 @@ def split_bam_to_chuncks(work, region, input_bam, chunck_size=200, records.append([record, len(q_seq)]) records = list(map(lambda x: x[0], sorted(records, key=lambda x: x[1]))) - if len(records) < chunck_size * chunck_scale: + if len(records) < chunk_size * chunk_scale: bams = [input_bam] lens = [len(records)] else: - n_splits = int(max(6, len(records) // chunck_size)) - new_chunck_size = len(records) // n_splits + n_splits = int(max(6, len(records) // chunk_size)) + new_chunk_size = len(records) // n_splits bams = [] lens = [] - n_split = (len(records) // new_chunck_size) + 1 - if 0 < (len(records) - ((n_split - 1) * new_chunck_size) + new_chunck_size) \ - < new_chunck_size * chunck_scale: + n_split = (len(records) // new_chunk_size) + 1 + if 0 < (len(records) - ((n_split - 1) * new_chunk_size) + new_chunk_size) \ + < new_chunk_size * chunk_scale: n_split -= 1 for i in range(n_split): - i_start = i * new_chunck_size + i_start = i * new_chunk_size i_end = (i + 1) * \ - new_chunck_size if i < (n_split - 1) else len(records) + new_chunk_size if i < (n_split - 1) else len(records) split_input_bam = os.path.join( work, region.__str__() + "_split_{}.bam".format(i)) with pysam.AlignmentFile(input_bam, "rb") as samfile: @@ -442,8 +442,8 @@ def read_info(info_file): logger = logging.getLogger(read_info.__name__) info = {} with open(info_file, 'r') as csvfile: - spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|') - for row in spamreader: + csvreader = csv.reader(csvfile, delimiter='\t', quotechar='|') + for row in csvreader: info[int(row[0])] = Read_Info(row[1:]) return info @@ -900,7 +900,7 @@ def TrimREFALT(ref, alt, pos): def run_realignment(input_record): - work, ref_fasta_file, target_region, pad, chunck_size, chunck_scale, \ + work, ref_fasta_file, target_region, pad, chunk_size, chunk_scale, \ snp_min_af, del_min_af, ins_min_af, len_chr, input_bam, \ match_score, mismatch_penalty, gap_open_penalty, gap_ext_penalty, \ msa_binary, get_var = input_record @@ 
-918,8 +918,8 @@ def run_realignment(input_record): pybedtools.set_tempdir(pybedtmp) variant = [] all_entries = [] - input_bam_splits, lens_splits = split_bam_to_chuncks( - work, region, input_bam, chunck_size, chunck_scale) + input_bam_splits, lens_splits = split_bam_to_chunks( + work, region, input_bam, chunk_size, chunk_scale) new_seqs = [] new_ref_seq = "" skipped = 0 @@ -1163,7 +1163,7 @@ def extend_regions_repeat(region_bed_file, extended_region_bed_file, ref_fasta_f def long_read_indelrealign(work, input_bam, output_bam, output_vcf, region_bed_file, ref_fasta_file, num_threads, pad, - chunck_size, chunck_scale, snp_min_af, del_min_af, ins_min_af, + chunk_size, chunk_scale, snp_min_af, del_min_af, ins_min_af, match_score, mismatch_penalty, gap_open_penalty, gap_ext_penalty, msa_binary): logger = logging.getLogger(long_read_indelrealign.__name__) @@ -1210,8 +1210,8 @@ def long_read_indelrealign(work, input_bam, output_bam, output_vcf, region_bed_f pool = multiprocessing.Pool(num_threads) map_args = [] for target_region in target_regions: - map_args.append((work, ref_fasta_file, target_region, pad, chunck_size, - chunck_scale, snp_min_af, del_min_af, ins_min_af, + map_args.append((work, ref_fasta_file, target_region, pad, chunk_size, + chunk_scale, snp_min_af, del_min_af, ins_min_af, chrom_lengths[target_region[0]], input_bam, match_score, mismatch_penalty, gap_open_penalty, gap_ext_penalty, msa_binary, get_var)) @@ -1299,9 +1299,9 @@ def long_read_indelrealign(work, input_bam, output_bam, output_vcf, region_bed_f help='number of threads', default=1) parser.add_argument('--pad', type=int, help='#base padding to the regions', default=1) - parser.add_argument('--chunck_size', type=int, + parser.add_argument('--chunk_size', type=int, help='chunk split size for high depth', default=600) - parser.add_argument('--chunck_scale', type=float, + parser.add_argument('--chunk_scale', type=float, help='chunk scale size for high depth', default=1.5) parser.add_argument('--snp_min_af', type=float, help='SNP min allele freq', default=0.05) @@ -1325,8 +1325,8 @@ def long_read_indelrealign(work, input_bam, output_bam, output_vcf, region_bed_f try: processor = long_read_indelrealign(args.work, args.input_bam, args.output_bam, args.output_vcf, args.region_bed, args.reference, - args.num_threads, args.pad, args.chunck_size, - args.chunck_scale, args.snp_min_af, args.del_min_af, + args.num_threads, args.pad, args.chunk_size, + args.chunk_scale, args.snp_min_af, args.del_min_af, args.ins_min_af, args.match_score, args.mismatch_penalty, args.gap_open_penalty, args.gap_ext_penalty, args.msa_binary) diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 87834f2..d3d9a63 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -143,7 +143,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ensemble_tsv, tumor_bam, min_len, postprocess_max_dist, long_read, - lr_pad, lr_chunck_size, lr_chunck_scale, + lr_pad, lr_chunk_size, lr_chunk_scale, lr_snp_min_af, lr_ins_min_af, lr_del_min_af, lr_match_score, lr_mismatch_penalty, lr_gap_open_penalty, lr_gap_ext_penalty, pass_threshold, lowqual_threshold, @@ -183,7 +183,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense work, "candidates_preds.ra_resolved.vcf") long_read_indelrealign(work_lr_indel_realign, tumor_bam, None, ra_resolved_vcf, target_bed, reference, num_threads,
lr_pad, - lr_chunck_size, lr_chunck_scale, lr_snp_min_af, + lr_chunk_size, lr_chunk_scale, lr_snp_min_af, lr_del_min_af, lr_ins_min_af, lr_match_score, lr_mismatch_penalty, lr_gap_open_penalty, lr_gap_ext_penalty, msa_binary) @@ -233,9 +233,9 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense '--long_read', help='Enable long_read (high error-rate sequence) indel realignment', action="store_true") parser.add_argument( '--lr_pad', type=int, help='long_read indel realign: #base padding to the regions', default=1) - parser.add_argument('--lr_chunck_size', type=int, + parser.add_argument('--lr_chunk_size', type=int, help='long_read indel realign: chunk split size for high depth', default=600) - parser.add_argument('--lr_chunck_scale', type=float, + parser.add_argument('--lr_chunk_scale', type=float, help='long_read indel realign: chunk scale size for high depth', default=1.5) parser.add_argument('--lr_snp_min_af', type=float, help='long_read indel realign: SNP min allele freq', default=0.05) @@ -270,7 +270,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense args.candidates_vcf, args.ensemble_tsv, args.tumor_bam, args.min_len, args.postprocess_max_dist, args.long_read, - args.lr_pad, args.lr_chunck_size, args.lr_chunck_scale, + args.lr_pad, args.lr_chunk_size, args.lr_chunk_scale, args.lr_snp_min_af, args.lr_ins_min_af, args.lr_del_min_af, args.lr_match_score, args.lr_mismatch_penalty, args.lr_gap_open_penalty, diff --git a/neusomatic/python/utils.py b/neusomatic/python/utils.py index b8eb237..c09e543 100755 --- a/neusomatic/python/utils.py +++ b/neusomatic/python/utils.py @@ -48,8 +48,8 @@ def concatenate_files(infiles, outfile, check_file_existence=True): def concatenate_vcfs(infiles, outfile, check_file_existence=True, header_string="#"): with open(outfile, "w") as out_fd: # Only keep files which exist - files_to_process = list(filter(lambda f: f and ( - not check_file_existence or os.path.isfile(f)), infiles)) + files_to_process = filter(lambda f: f and ( + not check_file_existence or os.path.isfile(f)), infiles) for index, infile in enumerate(files_to_process): with open(infile) as in_fd: From 7f57f89b749f8694199758271fd8ff6ff13d82a2 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 27 Feb 2019 16:25:00 -0800 Subject: [PATCH 4/7] updated version to 0.2.0 --- .gitignore | 1 - README.md | 13 ++++++----- docker/Dockerfile | 41 ++++++++++++++++++---------------- neusomatic/python/_version.py | 2 +- neusomatic/python/call.py | 4 ++-- test/NeuSomatic_ensemble.vcf | 2 +- test/NeuSomatic_standalone.vcf | 2 +- test/docker_test.sh | 18 +++++++-------- 8 files changed, 43 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 8868922..903e584 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ third_party/* neusomatic/python/__pycache__/ neusomatic/build/ neusomatic/bin/* -neusomatic/python/__pycache__/ others/ neusomatic/python/*.pyc test/example/ diff --git a/README.md b/README.md index f0f62f6..2cbff6d 100644 --- a/README.md +++ b/README.md @@ -29,32 +29,33 @@ doi: https://doi.org/10.1101/393801](https://doi.org/10.1101/393801) ## Availability -NeuSomatic is written in Python and C++ and requires a Unix-like environment to run.
It has been sucessfully tested on CentOS 7. Its deep learning framework is implemented using PyTorch 1.0.0 to enable GPU acceleration for training/testing. +NeuSomatic is written in Python and C++ and requires a Unix-like environment to run. It has been successfully tested on CentOS 7. Its deep learning framework is implemented using PyTorch 1.0.1 to enable GPU acceleration for training/testing. NeuSomatic first scans the genome to identify candidate variants and extract alignment information. The binary for this step can be obtained at `neusomatic/bin` folder by running `./build.sh` (which requires cmake 3.13.2 and g++ 5.4.0). -Python 2.7 and the following Python packages must be installed: -* pytorch 1.0.0 +Python 3.7 and the following Python packages must be installed: +* pytorch 1.0.1 * torchvision 0.2.1 * pybedtools 0.8.0 * pysam 0.15.2 * zlib 1.2.11 * numpy 1.15.4 * scipy 1.2.0 +* imageio 2.5.0 * biopython 1.73 It also depends on the following packages: -* cuda80 1.0 (if you want to use GPU) +* cudatoolkit 8.0 (if you want to use GPU) * tabix 0.2.6 * bedtools 2.27.1 * samtools 1.9 You can install these packages using [anaconda](https://www.anaconda.com/download)/[miniconda](https://conda.io/miniconda.html) : ``` -conda install zlib=1.2.11 numpy=1.15.4 scipy=1.2.0 cmake=3.13.2 +conda install zlib=1.2.11 numpy=1.15.4 scipy=1.2.0 cmake=3.13.2 imageio=2.5.0 conda install pysam=0.15.2 pybedtools=0.8.0 samtools=1.9 tabix=0.2.6 bedtools=2.27.1 biopython=1.73 -c bioconda -conda install pytorch=1.0.0 torchvision=0.2.1 cuda80==1.0 -c pytorch +conda install pytorch=1.0.1 torchvision=0.2.1 cudatoolkit=8.0 -c pytorch ``` Then you can export the conda paths as: ``` diff --git a/docker/Dockerfile b/docker/Dockerfile index ff905d7..861e9e0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,40 +1,43 @@ FROM ubuntu:16.04 -ENV NEUSOMATIC_VERSION 0.1.4 +ENV NEUSOMATIC_VERSION 0.2.0 ENV ZLIB_VERSION 1.2.11 -ENV NUMPY_VERSION 1.14.3 -ENV SCIPY_VERSION 1.1.0 -ENV PYTORCH_VERSION 0.3.1 -ENV TORCHVISION_VERSION 0.2.0 -ENV CMAKE_VERSION 3.12.1 -ENV PYBEDTOOLS_VERSION 0.7.10 -ENV PYSAM_VERSION 0.14.1 -ENV SAMTOOLS_VERSION 1.7 -ENV TABIX_VERSION 0.2.5 +ENV NUMPY_VERSION 1.15.4 +ENV SCIPY_VERSION 1.2.0 +ENV IMAGEIO_VERSION 2.5.0 +ENV PYTORCH_VERSION 1.0.1 +ENV TORCHVISION_VERSION 0.2.1 +ENV CUDATOOLKIT_VERSION=8.0 +ENV CMAKE_VERSION 3.13.2 +ENV PYBEDTOOLS_VERSION 0.8.0 +ENV PYSAM_VERSION 0.15.2 +ENV SAMTOOLS_VERSION 1.9 +ENV TABIX_VERSION 0.2.6 ENV BEDTOOLS_VERSION 2.27.1 -ENV BIOPYTHON_VERSION 1.68 +ENV BIOPYTHON_VERSION 1.73 ENV GCC_VERSION 5 RUN apt-get update && apt-get install -y --fix-missing \ build-essential zlib1g-dev curl less vim bzip2 RUN apt-get install -y --fix-missing git wget -RUN curl -LO http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -RUN bash Miniconda-latest-Linux-x86_64.sh -p /miniconda -b -RUN rm Miniconda-latest-Linux-x86_64.sh +RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh +RUN bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b +RUN rm Miniconda3-latest-Linux-x86_64.sh ENV PATH=/miniconda/bin:${PATH} ENV LD_LIBRARY_PATH=/miniconda/lib:${LD_LIBRARY_PATH} RUN conda update -y conda -RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} \ - scipy=${SCIPY_VERSION} && conda clean -a -RUN conda install -y pytorch=${PYTORCH_VERSION} \ - torchvision=${TORCHVISION_VERSION} -c pytorch && conda clean -a -RUN conda install -y cmake=${CMAKE_VERSION} -c conda-forge && conda clean -a + +RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} scipy=${SCIPY_VERSION}\ + imageio=${IMAGEIO_VERSION} cmake=${CMAKE_VERSION} && conda clean -a RUN conda install -y pysam=${PYSAM_VERSION} pybedtools=${PYBEDTOOLS_VERSION} \ samtools=${SAMTOOLS_VERSION}
tabix=${TABIX_VERSION} \ bedtools=${BEDTOOLS_VERSION} \ biopython=${BIOPYTHON_VERSION} -c bioconda && conda clean -a +RUN conda install -y pytorch=${PYTORCH_VERSION} \ + torchvision=${TORCHVISION_VERSION} \ + cudatoolkit=${CUDATOOLKIT_VERSION}-c pytorch && conda clean -a RUN apt-get install -y --fix-missing gcc-${GCC_VERSION} g++-${GCC_VERSION} diff --git a/neusomatic/python/_version.py b/neusomatic/python/_version.py index bbab024..d3ec452 100755 --- a/neusomatic/python/_version.py +++ b/neusomatic/python/_version.py @@ -1 +1 @@ -__version__ = "0.1.4" +__version__ = "0.2.0" diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index da375d2..2df7aba 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -14,7 +14,7 @@ import pysam import numpy as np -from scipy.misc import imsave, imread +from imageio import imwrite, imread import torch from torch.autograd import Variable import torch.nn as nn @@ -88,7 +88,7 @@ def call_variants(net, vartype_classes, call_loader, out_dir, model_tag, use_cud file_name = "{}/matrices_{}/{}.png".format( out_dir, model_tag, path) if not os.path.exists(file_name): - imsave(file_name, non_transformed_matrices[i, :, :, 0:3]) + imwrite(file_name, non_transformed_matrices[i, :, :, 0:3]) true_path[path] = file_name final_preds[path] = [vartype_classes[predicted[i]], pos_pred[i], len_pred[i], list(map(lambda x: round(x, 4), F.softmax( diff --git a/test/NeuSomatic_ensemble.vcf b/test/NeuSomatic_ensemble.vcf index 92b2aca..e037be6 100644 --- a/test/NeuSomatic_ensemble.vcf +++ b/test/NeuSomatic_ensemble.vcf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##NeuSomatic Version=0.1.4 +##NeuSomatic Version=0.2.0 ##FORMAT= ##FILTER= ##FILTER= diff --git a/test/NeuSomatic_standalone.vcf b/test/NeuSomatic_standalone.vcf index 6bee259..47bf078 100644 --- a/test/NeuSomatic_standalone.vcf +++ b/test/NeuSomatic_standalone.vcf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##NeuSomatic Version=0.1.4 +##NeuSomatic Version=0.2.0 ##FORMAT= ##FILTER= ##FILTER= diff --git a/test/docker_test.sh b/test/docker_test.sh index b19ba53..a3fafba 100755 --- a/test/docker_test.sh +++ b/test/docker_test.sh @@ -10,16 +10,16 @@ if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa ] then if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "cd /mnt/example/ && wget ftp://ftp.ensembl.org/pub/release-75//fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "cd /mnt/example/ && gunzip -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi if [ ! 
-f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.fai ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "samtools faidx /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa" fi rm -rf work_standalone @@ -27,7 +27,7 @@ rm -rf work_standalone #Stand-alone NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -45,7 +45,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 --num_threads 1 \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.1.4 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_standalone/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -54,7 +54,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --num_threads 1 \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ @@ -66,7 +66,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 rm -rf /mnt/example/work_ensemble #Ensemble NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -85,7 +85,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 --ensemble_tsv /mnt/ensemble.tsv \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.1.4 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_ensemble/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -95,7 +95,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --ensemble \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.1.4 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ From 46e772277284aab677d54dc3968862b4232053ab Mon Sep 17 00:00:00 
2001 From: Sahraeian Date: Wed, 27 Feb 2019 23:35:32 -0800 Subject: [PATCH 5/7] fixed docker file --- docker/Dockerfile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 861e9e0..ee811b0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -7,14 +7,14 @@ ENV SCIPY_VERSION 1.2.0 ENV IMAGEIO_VERSION 2.5.0 ENV PYTORCH_VERSION 1.0.1 ENV TORCHVISION_VERSION 0.2.1 -ENV CUDATOOLKIT_VERSION=8.0 +ENV CUDATOOLKIT_VERSION 8.0 ENV CMAKE_VERSION 3.13.2 ENV PYBEDTOOLS_VERSION 0.8.0 ENV PYSAM_VERSION 0.15.2 ENV SAMTOOLS_VERSION 1.9 ENV TABIX_VERSION 0.2.6 ENV BEDTOOLS_VERSION 2.27.1 -ENV BIOPYTHON_VERSION 1.73 +ENV BIOPYTHON_VERSION 1.72 ENV GCC_VERSION 5 RUN apt-get update && apt-get install -y --fix-missing \ @@ -29,15 +29,16 @@ ENV LD_LIBRARY_PATH=/miniconda/lib:${LD_LIBRARY_PATH} RUN conda update -y conda -RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} scipy=${SCIPY_VERSION}\ - imageio=${IMAGEIO_VERSION} cmake=${CMAKE_VERSION} && conda clean -a +RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} scipy=${SCIPY_VERSION} \ + imageio=${IMAGEIO_VERSION} && conda clean -a +RUN conda install -y cmake=${CMAKE_VERSION} -c conda-forge && conda clean -a RUN conda install -y pysam=${PYSAM_VERSION} pybedtools=${PYBEDTOOLS_VERSION} \ samtools=${SAMTOOLS_VERSION} tabix=${TABIX_VERSION} \ bedtools=${BEDTOOLS_VERSION} \ biopython=${BIOPYTHON_VERSION} -c bioconda && conda clean -a RUN conda install -y pytorch=${PYTORCH_VERSION} \ torchvision=${TORCHVISION_VERSION} \ - cudatoolkit=${CUDATOOLKIT_VERSION}-c pytorch && conda clean -a + cudatoolkit=${CUDATOOLKIT_VERSION} -c pytorch && conda clean -a RUN apt-get install -y --fix-missing gcc-${GCC_VERSION} g++-${GCC_VERSION} From e77c242142fca4c14eb0e5570e246594716a8b14 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 28 Feb 2019 11:20:33 -0800 Subject: [PATCH 6/7] some logging added --- neusomatic/python/call.py | 4 ++++ neusomatic/python/train.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 2680109..04f9d2b 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -404,7 +404,11 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, num_channels = 119 if ensemble else 26 net = NeuSomaticNet(num_channels) if use_cuda: + logger.info("GPU calling!") net.cuda() + else: + logger.info("CPU calling!") + if torch.cuda.device_count() > 1: logger.info("We use {} GPUs!".format(torch.cuda.device_count())) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 7e4903e..8dbb86e 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -204,7 +204,10 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo num_channels = 119 if ensemble else 26 net = NeuSomaticNet(num_channels) if use_cuda: + logger.info("GPU training!") net.cuda() + else: + logger.info("CPU training!") if torch.cuda.device_count() > 1: logger.info("We use {} GPUs!".format(torch.cuda.device_count())) From df51e9d25b2644c85e3afc15e3291afaa50a1999 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 28 Feb 2019 16:24:30 -0800 Subject: [PATCH 7/7] pytorch version logging added --- neusomatic/python/call.py | 3 +++ neusomatic/python/train.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 04f9d2b..0f13cd1 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py
@@ -20,6 +20,7 @@ import torch.nn as nn import torch.nn.functional as F from torchvision import transforms +import torchvision from network import NeuSomaticNet from dataloader import NeuSomaticDataset @@ -391,6 +392,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("-----------------Call Somatic Mutations--------------------") + logger.info("PyTorch Version: {}".format(torch.__version__)) + logger.info("Torchvision Version: {}".format(torchvision.__version__)) if not use_cuda: torch.set_num_threads(num_threads) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 8dbb86e..a8620d3 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -16,6 +16,7 @@ import torch.nn.functional as F import torch.optim as optim from torchvision import transforms +import torchvision from random import shuffle import pickle @@ -191,6 +192,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo logger = logging.getLogger(train_neusomatic.__name__) logger.info("----------------Train NeuSomatic Network-------------------") + logger.info("PyTorch Version: {}".format(torch.__version__)) + logger.info("Torchvision Version: {}".format(torchvision.__version__)) if not os.path.exists(out_dir): os.mkdir(out_dir)
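The hunks in patches 2 and 3 repeat a small set of mechanical Python 2-to-3 patterns: wrapping the now-lazy `map`/`filter`/`zip`/`dict.items()` in `list()` wherever the result is indexed or reused, switching `/` to `//` where floor division is intended, unpacking pooled worker arguments inside the function body (PEP 3113 removed tuple parameters, hence the `input_record` rewrites of `run_realignment`, `correct_bam_chrom`, and `run_scan_alignments`), and opening pickle index files in binary mode. The sketch below is a minimal, self-contained illustration of those patterns; every name in it (`worker`, `counts`, `example.idx`) is invented for the example and is not NeuSomatic's API.

```python
import multiprocessing
import pickle


def worker(input_record):
    # PEP 3113 removed tuple parameters, so "def worker((a, b)):" becomes
    # explicit unpacking inside the function body.
    name, chunk_size = input_record
    return len(name) // chunk_size  # "//" keeps Python 2's floor division


def main():
    counts = {"DEL": 2, "INS": 1, "SNP": 4}
    # items()/keys()/map()/filter()/zip() now return views or one-shot
    # iterators, so wrap them in list() wherever the result is indexed,
    # reused, or len()-ed -- the bulk of the hunks in patch 2.
    pairs = list(counts.items())
    frequent = list(filter(lambda kv: kv[1] > 1, pairs))
    first_key = list(counts.keys())[0]

    # pickle index files must be opened in binary mode under Python 3,
    # matching the "rb"/"wb" changes in merge_tsvs.py and train.py.
    with open("example.idx", "wb") as fd:
        pickle.dump(frequent, fd)
    with open("example.idx", "rb") as fd:
        restored = pickle.load(fd)

    with multiprocessing.Pool(2) as pool:
        results = pool.map(worker, [("tumor", 2), ("normal", 3)])
    print(first_key, restored, results)


if __name__ == "__main__":
    main()
```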
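Several hunks above also append `.saveas()` to `pybedtools.BedTool` objects built from generators or `.filter()` chains (`find_realign_dict`, `find_resolved_variants`, `scan_alignments`, `split_bed`). Under Python 3, `map()` returns an iterator, so a BedTool wrapped around it is a one-shot stream; `.saveas()` materializes it to a temp file so later `.sort()`, `.merge()`, or a second iteration still see every interval. A small sketch of that idiom, assuming pybedtools 0.8.0 with a bedtools binary on the PATH:

```python
import pybedtools

# A BedTool built from a generator (or from .filter()) is consumed lazily;
# .saveas() writes it to a temp file so it can be iterated more than once.
intervals = (pybedtools.Interval("22", start, start + 50)
             for start in range(21330000, 21330500, 100))
bed = pybedtools.BedTool(intervals).saveas()

total_span = sum(len(iv) for iv in bed)  # first pass over the intervals
merged = bed.sort().merge()              # second pass still sees all records
print(total_span, merged.count())
```

Without the `.saveas()`, the second pass would operate on an exhausted iterator and silently produce an empty result, which is exactly the failure mode these hunks guard against.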
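Finally, the expected-output VCFs in `test/` change only in the low decimal places of QUAL (e.g. 40.0000 to 39.9993), consistent with tiny numerical differences from the framework upgrade rather than any behavioral change. The shifts line up with QUAL being a Phred-scaled transform of SCORE; this relationship is inferred from the rows above, not quoted from NeuSomatic's code, so treat the check below as a hypothesis (small mismatches are expected because the printed SCOREs are rounded to four decimals).

```python
import math

def approx_qual(score):
    # Hypothesized Phred scaling: QUAL ~= -10 * log10(1 - SCORE).
    return -10.0 * math.log10(1.0 - score)

# (SCORE, QUAL) pairs taken from the test VCF rows above.
for score, qual in [(0.9980, 26.9905), (0.9999, 39.9993), (0.9940, 22.2248)]:
    print(score, qual, round(approx_qual(score), 4))
```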