Merge pull request #30 from bioinform/update_to_python3
Update to python3
msahraeian authored Mar 1, 2019
2 parents 81dc194 + df51e9d commit d2dd889
Showing 21 changed files with 436 additions and 404 deletions.
34 changes: 17 additions & 17 deletions README.md
@@ -29,33 +29,33 @@ doi: https://doi.org/10.1101/393801](https://doi.org/10.1101/393801)

## Availability

NeuSomatic is written in Python and C++ and requires a Unix-like environment to run. It has been successfully tested on CentOS 7. Its deep learning framework is implemented using PyTorch 0.3.1 to enable GPU acceleration for training/testing.
NeuSomatic is written in Python and C++ and requires a Unix-like environment to run. It has been successfully tested on CentOS 7. Its deep learning framework is implemented using PyTorch 1.0.1 to enable GPU acceleration for training/testing.

NeuSomatic first scans the genome to identify candidate variants and extract alignment information.
The binary for this step can be obtained in the `neusomatic/bin` folder by running `./build.sh` (which requires cmake 3.12.1 and g++ 5.4.0).
The binary for this step can be obtained in the `neusomatic/bin` folder by running `./build.sh` (which requires cmake 3.13.2 and g++ 5.4.0).

Python 2.7 and the following Python packages must be installed:
* pytorch 0.3.1
* torchvision 0.2.0
* pybedtools 0.7.10
* pysam 0.14.1
Python 3.7 and the following Python packages must be installed:
* pytorch 1.0.1
* torchvision 0.2.1
* pybedtools 0.8.0
* pysam 0.15.2
* zlib 1.2.11
* numpy 1.14.3
* scipy 1.1.0
* biopython 1.68
* numpy 1.15.4
* scipy 1.2.0
* imageio 2.5.0
* biopython 1.73

It also depends on the following packages:
* cuda80 1.0 (if you want to use GPU)
* tabix 0.2.5
* cudatoolkit 8.0 (if you want to use GPU)
* tabix 0.2.6
* bedtools 2.27.1
* samtools 1.7
* samtools 1.9

You can install these packages using [anaconda](https://www.anaconda.com/download)/[miniconda](https://conda.io/miniconda.html) :
```
conda install zlib=1.2.11 numpy=1.14.3 scipy=1.1.0
conda install pytorch=0.3.1 torchvision=0.2.0 cuda80=1.0 -c pytorch
conda install cmake=3.12.1 -c conda-forge
conda install pysam=0.14.1 pybedtools=0.7.10 samtools=1.7 tabix=0.2.5 bedtools=2.27.1 biopython=1.68 -c bioconda
conda install zlib=1.2.11 numpy=1.15.4 scipy=1.2.0 cmake=3.13.2 imageio=2.5.0
conda install pysam=0.15.2 pybedtools=0.8.0 samtools=1.9 tabix=0.2.6 bedtools=2.27.1 biopython=1.73 -c bioconda
conda install pytorch=1.0.1 torchvision=0.2.1 cudatoolkit=8.0 -c pytorch
```
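A quick way to confirm that the pinned versions were actually picked up is to import the packages and print their version strings (a minimal sketch; the expected values are simply the pins listed above, and the package set can be trimmed as needed):
```
import torch, torchvision, pysam, numpy, scipy
# Expected, per the pins above: 1.0.1, 0.2.1, 0.15.2, 1.15.4, 1.2.0
print(torch.__version__, torchvision.__version__, pysam.__version__,
      numpy.__version__, scipy.__version__)
print(torch.cuda.is_available())  # True only if the cudatoolkit/GPU stack is usable
```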
Then you can export the conda paths as:
```
40 changes: 22 additions & 18 deletions docker/Dockerfile
@@ -1,40 +1,44 @@
FROM ubuntu:16.04

ENV NEUSOMATIC_VERSION 0.1.4
ENV NEUSOMATIC_VERSION 0.2.0
ENV ZLIB_VERSION 1.2.11
ENV NUMPY_VERSION 1.14.3
ENV SCIPY_VERSION 1.1.0
ENV PYTORCH_VERSION 0.3.1
ENV TORCHVISION_VERSION 0.2.0
ENV CMAKE_VERSION 3.12.1
ENV PYBEDTOOLS_VERSION 0.7.10
ENV PYSAM_VERSION 0.14.1
ENV SAMTOOLS_VERSION 1.7
ENV TABIX_VERSION 0.2.5
ENV NUMPY_VERSION 1.15.4
ENV SCIPY_VERSION 1.2.0
ENV IMAGEIO_VERSION 2.5.0
ENV PYTORCH_VERSION 1.0.1
ENV TORCHVISION_VERSION 0.2.1
ENV CUDATOOLKIT_VERSION 8.0
ENV CMAKE_VERSION 3.13.2
ENV PYBEDTOOLS_VERSION 0.8.0
ENV PYSAM_VERSION 0.15.2
ENV SAMTOOLS_VERSION 1.9
ENV TABIX_VERSION 0.2.6
ENV BEDTOOLS_VERSION 2.27.1
ENV BIOPYTHON_VERSION 1.68
ENV BIOPYTHON_VERSION 1.72
ENV GCC_VERSION 5

RUN apt-get update && apt-get install -y --fix-missing \
build-essential zlib1g-dev curl less vim bzip2
RUN apt-get install -y --fix-missing git wget

RUN curl -LO http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh
RUN bash Miniconda-latest-Linux-x86_64.sh -p /miniconda -b
RUN rm Miniconda-latest-Linux-x86_64.sh
RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
RUN bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b
RUN rm Miniconda3-latest-Linux-x86_64.sh
ENV PATH=/miniconda/bin:${PATH}
ENV LD_LIBRARY_PATH=/miniconda/lib:${LD_LIBRARY_PATH}
RUN conda update -y conda

RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} \
scipy=${SCIPY_VERSION} && conda clean -a
RUN conda install -y pytorch=${PYTORCH_VERSION} \
torchvision=${TORCHVISION_VERSION} -c pytorch && conda clean -a

RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} scipy=${SCIPY_VERSION} \
imageio=${IMAGEIO_VERSION} && conda clean -a
RUN conda install -y cmake=${CMAKE_VERSION} -c conda-forge && conda clean -a
RUN conda install -y pysam=${PYSAM_VERSION} pybedtools=${PYBEDTOOLS_VERSION} \
samtools=${SAMTOOLS_VERSION} tabix=${TABIX_VERSION} \
bedtools=${BEDTOOLS_VERSION} \
biopython=${BIOPYTHON_VERSION} -c bioconda && conda clean -a
RUN conda install -y pytorch=${PYTORCH_VERSION} \
torchvision=${TORCHVISION_VERSION} \
cudatoolkit=${CUDATOOLKIT_VERSION} -c pytorch && conda clean -a

RUN apt-get install -y --fix-missing gcc-${GCC_VERSION} g++-${GCC_VERSION}

2 changes: 1 addition & 1 deletion neusomatic/python/_version.py
@@ -1 +1 @@
__version__ = "0.1.4"
__version__ = "0.2.0"
63 changes: 35 additions & 28 deletions neusomatic/python/call.py
@@ -14,12 +14,13 @@

import pysam
import numpy as np
from scipy.misc import imsave, imread
from imageio import imwrite, imread
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import torchvision

from network import NeuSomaticNet
from dataloader import NeuSomaticDataset
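`scipy.misc.imsave` and `scipy.misc.imread` are removed in SciPy 1.2.0, so the candidate matrix images are now written and read through `imageio` instead; `imageio.imwrite(path, array)` is a drop-in replacement here. A minimal sketch of the pattern (the array and file name are illustrative):
```
import numpy as np
from imageio import imwrite, imread

matrix = (np.random.rand(32, 32, 3) * 255).astype(np.uint8)  # illustrative uint8 RGB image
imwrite("example.png", matrix)    # replaces scipy.misc.imsave(path, array)
restored = imread("example.png")
print(restored.shape)             # (32, 32, 3)
```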
@@ -88,34 +89,35 @@ def call_variants(net, vartype_classes, call_loader, out_dir, model_tag, use_cud
file_name = "{}/matrices_{}/{}.png".format(
out_dir, model_tag, path)
if not os.path.exists(file_name):
imsave(file_name, non_transformed_matrices[i, :, :, 0:3])
imwrite(file_name, non_transformed_matrices[i, :, :, 0:3])
true_path[path] = file_name
final_preds[path] = [vartype_classes[predicted[i]], pos_pred[i], len_pred[i],
map(lambda x: round(x, 4), F.softmax(
outputs1[i, :], 0).data.cpu().numpy()),
map(lambda x: round(x, 4), F.softmax(
outputs3[i, :], 0).data.cpu().numpy()),
map(lambda x: round(x, 4),
outputs1.data.cpu()[i].numpy()),
map(lambda x: round(x, 4),
outputs3.data.cpu()[i].numpy())]
list(map(lambda x: round(x, 4), F.softmax(
outputs1[i, :], 0).data.cpu().numpy())),
list(map(lambda x: round(x, 4), F.softmax(
outputs3[i, :], 0).data.cpu().numpy())),
list(map(lambda x: round(x, 4),
outputs1.data.cpu()[i].numpy())),
list(map(lambda x: round(x, 4),
outputs3.data.cpu()[i].numpy()))]
else:
none_preds[path] = [vartype_classes[predicted[i]], pos_pred[i], len_pred[i],
map(lambda x: round(x, 4), F.softmax(
outputs1[i, :], 0).data.cpu().numpy()),
map(lambda x: round(x, 4), F.softmax(
outputs3[i, :], 0).data.cpu().numpy()),
map(lambda x: round(x, 4),
outputs1.data.cpu()[i].numpy()),
map(lambda x: round(x, 4),
outputs3.data.cpu()[i].numpy())]
list(map(lambda x: round(x, 4), F.softmax(
outputs1[i, :], 0).data.cpu().numpy())),
list(map(lambda x: round(x, 4), F.softmax(
outputs3[i, :], 0).data.cpu().numpy())),
list(map(lambda x: round(x, 4),
outputs1.data.cpu()[i].numpy())),
list(map(lambda x: round(x, 4),
outputs3.data.cpu()[i].numpy()))]
if (iii % 10 == 0):
logger.info("Called {} candidates in this batch.".format(j))
logger.info("Called {} candidates in this batch.".format(j))
return final_preds, none_preds, true_path
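In Python 3, `map` returns a lazy iterator instead of a list, so the rounded softmax and score vectors above are wrapped in `list(...)` before being stored. The same pattern on illustrative values:
```
scores = [0.123456, 0.678912, 0.000199]
rounded = list(map(lambda x: round(x, 4), scores))
print(rounded)  # [0.1235, 0.6789, 0.0002]
```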


def pred_vcf_records_path((path, true_path_, pred_all, chroms, vartype_classes, ref_file)):
def pred_vcf_records_path(record):
path, true_path_, pred_all, chroms, vartype_classes, ref_file = record
thread_logger = logging.getLogger(
"{} ({})".format(pred_vcf_records_path.__name__, multiprocessing.current_process().name))
try:
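Tuple unpacking in function signatures was removed in Python 3 (PEP 3113), so the worker now takes a single `record` argument and unpacks it on its first line; a single-argument signature also stays compatible with `multiprocessing.Pool.map`, which passes one item per task. The pattern in isolation (names and values are illustrative):
```
# Python 2 (no longer valid syntax in Python 3):
#   def worker((path, ref_file)):
#       return path, ref_file

# Python 3 equivalent, matching the style used above:
def worker(record):
    path, ref_file = record
    return path, ref_file

print(worker(("region_1.png", "genome.fa")))  # ('region_1.png', 'genome.fa')
```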
@@ -208,7 +210,7 @@ def pred_vcf_records_path((path, true_path_, pred_all, chroms, vartype_classes,
b = (anchor[0] - col_2_pos[anchor[1]])
for i in nzref_pos:
col_2_pos[i] += b
pos_2_col = {v: k for k, v in col_2_pos.iteritems()}
pos_2_col = {v: k for k, v in col_2_pos.items()}

if abs(center_pred - center) < too_far_center:
if type_pred == "SNP":
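`dict.iteritems()` no longer exists in Python 3; `.items()` returns a lightweight view, so the position/column inversion above needs no further change. In isolation (the mapping values are illustrative):
```
col_2_pos = {0: 1000, 1: 1001, 2: 1003}            # column index -> reference position
pos_2_col = {v: k for k, v in col_2_pos.items()}   # .iteritems() would raise AttributeError
print(pos_2_col)  # {1000: 0, 1001: 1, 1003: 2}
```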
@@ -324,7 +326,7 @@ def pred_vcf_records(ref_file, final_preds, true_path, chroms, vartype_classes,
if o is None:
raise Exception("pred_vcf_records_path failed!")

all_vcf_records = filter(None, all_vcf_records)
all_vcf_records = list(filter(None, all_vcf_records))

return all_vcf_records
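`filter` is likewise lazy in Python 3, so `filter(None, ...)`, which drops falsy entries such as `None`, is materialized with `list(...)` before the records are returned. Illustrative values:
```
all_vcf_records = [("chr1", 1000), None, ("chr2", 2000)]
kept = list(filter(None, all_vcf_records))  # without list(), this is a one-shot filter object
print(kept)  # [('chr1', 1000), ('chr2', 2000)]
```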

@@ -361,7 +363,7 @@ def get_vcf_records(all_vcf_records):

def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_threshold):
logger = logging.getLogger(write_vcf.__name__)
vcf_records = filter(lambda x: len(x) > 0, vcf_records)
vcf_records = list(filter(lambda x: len(x) > 0, vcf_records))
vcf_records = sorted(vcf_records, key=lambda x: [chroms_order[x[0]], x[1]])
lines = []
with open(output_vcf, "w") as ov:
@@ -390,6 +392,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,

logger.info("-----------------Call Somatic Mutations--------------------")

logger.info("PyTorch Version: {}".format(torch.__version__))
logger.info("Torchvision Version: {}".format(torchvision.__version__))
if not use_cuda:
torch.set_num_threads(num_threads)

@@ -399,12 +403,15 @@

vartype_classes = ['DEL', 'INS', 'NONE', 'SNP']
data_transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
[transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
num_channels = 119 if ensemble else 26
net = NeuSomaticNet(num_channels)
if use_cuda:
logger.info("GPU calling!")
net.cuda()
else:
logger.info("CPU calling!")


if torch.cuda.device_count() > 1:
logger.info("We use {} GPUs!".format(torch.cuda.device_count()))
@@ -431,10 +438,10 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
# 1. filter out unnecessary keys
# pretrained_state_dict = {
# k: v for k, v in pretrained_state_dict.items() if k in model_dict}
if "module." in pretrained_state_dict.keys()[0] and "module." not in model_dict.keys()[0]:
if "module." in list(pretrained_state_dict.keys())[0] and "module." not in list(model_dict.keys())[0]:
pretrained_state_dict = {k.split("module.")[1]: v for k, v in pretrained_state_dict.items(
) if k.split("module.")[1] in model_dict}
elif "module." not in pretrained_state_dict.keys()[0] and "module." in model_dict.keys()[0]:
elif "module." not in list(pretrained_state_dict.keys())[0] and "module." in list(model_dict.keys())[0]:
pretrained_state_dict = {
("module." + k): v for k, v in pretrained_state_dict.items()
if ("module." + k) in model_dict}
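Dictionary views are not indexable in Python 3, so peeking at the first key of a `state_dict` requires materializing the view with `list(...)` first, as done above. A small illustration (the key is hypothetical):
```
pretrained_state_dict = {"module.conv1.weight": None}   # hypothetical checkpoint key
first_key = list(pretrained_state_dict.keys())[0]       # .keys()[0] raises TypeError in Python 3
print(first_key.startswith("module."))                  # True
```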
@@ -456,7 +463,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
candidates_tsv_=[]
split_i = 0
for candidate_file in candidates_tsv:
idx = pickle.load(open(candidate_file + ".idx"))
idx = pickle.load(open(candidate_file + ".idx", "rb"))
if len(idx) > max_load_candidates / 2:
logger.info("Splitting {} of length {}".format(candidate_file, len(idx)))
new_split_tsvs_dir_i=os.path.join(new_split_tsvs_dir,"split_{}".format(split_i))
Expand All @@ -471,7 +478,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
overwrite_merged_tsvs=True,
keep_none_types=True)
for candidate_file_split in candidate_file_splits:
idx_split = pickle.load(open(candidate_file_split + ".idx"))
idx_split = pickle.load(open(candidate_file_split + ".idx", "rb"))
candidates_tsv_.append(candidate_file_split)
Ls.append(len(idx_split) - 1)
split_i += 1
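Under Python 3, pickled `.idx` files have to be opened in binary mode, hence the added `"rb"`; loading through a text-mode handle fails. A self-contained round trip (file name and contents are illustrative):
```
import pickle

idx = [0, 120, 240, 360]               # illustrative candidate-file offsets
with open("example.idx", "wb") as fh:  # write in binary mode
    pickle.dump(idx, fh)
with open("example.idx", "rb") as fh:  # "rb" is required for pickle.load in Python 3
    assert pickle.load(fh) == idx
```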
25 changes: 14 additions & 11 deletions neusomatic/python/dataloader.py
@@ -36,7 +36,7 @@ def candidate_loader_tsv(tsv, open_tsv, idx, i):
tag = fields[2]
im = extract_zlib(base64.b64decode(fields[3]))
if len(fields) > 4:
anns = map(float, fields[4:])
anns = list(map(float, fields[4:]))
else:
anns = []
label = type_class_dict[tag.split(".")[4]]
Expand All @@ -45,7 +45,8 @@ def candidate_loader_tsv(tsv, open_tsv, idx, i):
return tag, im, anns, label


def extract_info_tsv((i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasses_l)):
def extract_info_tsv(record):
i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasses_l = record
thread_logger = logging.getLogger(
"{} ({})".format(extract_info_tsv.__name__, multiprocessing.current_process().name))
try:
@@ -89,7 +90,7 @@ def extract_info_tsv((i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasse
im = extract_zlib(base64.b64decode(fields[3]))
label = type_class_dict[tag.split(".")[4]]
if len(fields) > 4:
anns = map(float, fields[4:])
anns = list(map(float, fields[4:]))
else:
anns = []
data.append([tag, im, anns, label])
@@ -147,7 +148,7 @@ def __init__(self, roots, max_load_candidates, transform=None,
self.tsvs.append(tsv)
for t in range(num_threads):
self.open_tsvs[t].append("")
idx = pickle.load(open(tsv + ".idx"))
idx = pickle.load(open(tsv + ".idx", "rb"))
self.idxs.append(idx)
self.Ls.append(len(idx) - 1)
self.data = []
Expand All @@ -167,7 +168,7 @@ def __init__(self, roots, max_load_candidates, transform=None,
Ls_ = []
for i_b, _ in batch:
tsv = self.tsvs[i_b]
max_load_ = self.Ls[i_b] * max_load_candidates / \
max_load_ = self.Ls[i_b] * max_load_candidates // \
total_L if total_L > 0 else 0
map_args.append([i_b, tsv, self.idxs[i_b], self.Ls[i_b],
max_load_, nclasses_t, nclasses_l])
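`/` is true division in Python 3 and always yields a float, so quantities used as counts or indices switch to floor division with `//`, as in the `max_load_` computation above. A quick arithmetic check (the numbers are illustrative):
```
L_i, max_load_candidates, total_L = 1000, 300, 4000
max_load = L_i * max_load_candidates // total_L if total_L > 0 else 0
print(max_load)  # 75, an int; with plain / the result would be 75.0
```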
@@ -198,8 +199,8 @@ def __init__(self, roots, max_load_candidates, transform=None,
for matrices, data, none_ids, var_ids, count_class_t, count_class_l in records_:
self.matrices += matrices
self.data += data
self.none_ids += map(lambda x: x + j, none_ids)
self.var_ids += map(lambda x: x + j, var_ids)
self.none_ids += list(map(lambda x: x + j, none_ids))
self.var_ids += list(map(lambda x: x + j, var_ids))
for k in range(nclasses_t):
self.count_class_t[k] += count_class_t[k]
for k in range(nclasses_l):
@@ -271,12 +272,12 @@ def __getitem__(self, index):
h, w, c = matrix.shape
r = random.rand()
if r < 0.6:
x_left = random.randint((center - 2) * 2 / 3, center - 2)
x_left = random.randint((center - 2) * 2 // 3, center - 2)
else:
x_left = 0
if r > 0.4:
x_right = random.randint(
(w - center - 2) * 2 / 3, w - center - 2)
(w - center - 2) * 2 // 3, w - center - 2)
else:
x_right = 0
if x_left > 0:
@@ -385,9 +386,11 @@ def __getitem__(self, index):
non_transformed_matrix = np.array(orig_matrix).astype(np.uint8)
else:
non_transformed_matrix = []


matrix = torch.from_numpy(matrix.transpose((2, 0, 1)))
matrix = matrix.float().div(255)
if self.transform is not None:
matrix = self.transform(matrix)
matrix = self.transform(matrix)

var_pos = [length, center]
var_pos = torch.Tensor(var_pos)
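The explicit conversion above, `transpose((2, 0, 1))` to reorder HWC to CHW followed by `.float().div(255)` to rescale uint8 values into [0, 1], reproduces what `torchvision.transforms.ToTensor` does for a uint8 array; the transform composed in `call.py` accordingly keeps only `Normalize`. A sketch with an illustrative array:
```
import numpy as np
import torch

hwc = (np.random.rand(5, 5, 3) * 255).astype(np.uint8)             # H x W x C, values 0-255
chw = torch.from_numpy(hwc.transpose((2, 0, 1))).float().div(255)  # C x H x W, values in [0, 1]
print(chw.shape, float(chw.min()) >= 0.0, float(chw.max()) <= 1.0)  # torch.Size([3, 5, 5]) True True
```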
4 changes: 2 additions & 2 deletions neusomatic/python/extract_postprocess_targets.py
@@ -34,7 +34,7 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad):
if not record_set:
record_set.append(record)
continue
if len(filter(lambda x: (chrom == x[0] and (abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos)) <= max_dist)), record_set)) > 0:
if len(list(filter(lambda x: (chrom == x[0] and (abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos)) <= max_dist)), record_set))) > 0:
record_set.append(record)
continue

Expand All @@ -45,7 +45,7 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad):

for ii, record_set in enumerate(record_sets):
if len(record_set) > 1:
if filter(lambda x: len(x[2]) != len(x[3]), record_set):
if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)):
for x in record_set:
fields = x[-1].strip().split()
fields[2] = str(ii)
