From 57a9c423643c4b1cb28e9d86300689a2c50c6d02 Mon Sep 17 00:00:00 2001
From: ctokheim <collintokheim@gmail.com>
Date: Mon, 13 Jun 2016 19:32:07 -0400
Subject: [PATCH] Changed memory management in simulations

---
 prob2020/python/permutation.py | 150 ++++++++++++++++++++-------------
 1 file changed, 91 insertions(+), 59 deletions(-)

diff --git a/prob2020/python/permutation.py b/prob2020/python/permutation.py
index 09cb0c4..c8df7e4 100644
--- a/prob2020/python/permutation.py
+++ b/prob2020/python/permutation.py
@@ -12,7 +12,8 @@ def deleterious_permutation(obs_del,
                             gene_seq,
                             num_permutations=10000,
                             stop_criteria=100,
-                            pseudo_count=0):
+                            pseudo_count=0,
+                            max_batch=1000):
     """Performs null-permutations for deleterious mutation statistics
     in a single gene.
 
@@ -46,34 +47,49 @@ def deleterious_permutation(obs_del,
                     for one_context in mycontexts
                     for base in context_to_mut[one_context]]
 
-    # get random positions determined by sequence context
-    tmp_contxt_pos = seq_context.random_pos(context_counts.iteritems(),
-                                            num_permutations)
-    tmp_mut_pos = np.hstack(pos_array for base, pos_array in tmp_contxt_pos)
+    # calculate the # of batches for simulations
+    max_batch = min(num_permutations, max_batch)
+    num_batches = num_permutations // max_batch
+    remainder = num_permutations % max_batch
+    batch_sizes = [max_batch] * num_batches
+    if remainder:
+        batch_sizes += [remainder]
 
-    # determine result of random positions
-    #del_count_list = []
     null_del_ct = 0
-    for i, row in enumerate(tmp_mut_pos):
-        # get info about mutations
-        tmp_mut_info = mc.get_aa_mut_info(row,
-                                          somatic_base,
-                                          gene_seq)
+    for j, batch_size in enumerate(batch_sizes):
+        # stop iterations if reached sufficient precision
+        if null_del_ct >= stop_criteria:
+            i = 0
+            break
 
-        # calc deleterious mutation info
-        tmp_del_count = cutils.calc_deleterious_info(tmp_mut_info['Reference AA'],
-                                                     tmp_mut_info['Somatic AA'],
-                                                     tmp_mut_info['Codon Pos'])
-        #del_count_list.append(tmp_del_count + pseudo_count)
+        # get random positions determined by sequence context
+        tmp_contxt_pos = seq_context.random_pos(context_counts.iteritems(),
+                                                batch_size)
+        tmp_mut_pos = np.hstack(pos_array for base, pos_array in tmp_contxt_pos)
 
-        # update empricial null distribution
-        if tmp_del_count >= obs_del: null_del_ct += 1
+        # determine result of random positions
+        #del_count_list = []
+        for i, row in enumerate(tmp_mut_pos):
+            # get info about mutations
+            tmp_mut_info = mc.get_aa_mut_info(row,
+                                              somatic_base,
+                                              gene_seq)
 
-        # stop if reach sufficient precision on p-value
-        if null_del_ct >= stop_criteria:
-            break
+            # calc deleterious mutation info
+            tmp_del_count = cutils.calc_deleterious_info(tmp_mut_info['Reference AA'],
+                                                         tmp_mut_info['Somatic AA'],
+                                                         tmp_mut_info['Codon Pos'])
+            #del_count_list.append(tmp_del_count + pseudo_count)
+
+            # update empricial null distribution
+            if tmp_del_count >= obs_del: null_del_ct += 1
+
+            # stop if reach sufficient precision on p-value
+            if null_del_ct >= stop_criteria:
+                break
 
-    del_pval = float(null_del_ct) / (i+1)
+    num_sim = j*max_batch + i+1
+    del_pval = float(null_del_ct) / (num_sim)
 
     #return del_count_list
     return del_pval
@@ -87,7 +103,8 @@ def position_permutation(obs_stat,
                          gene_vest=None,
                          num_permutations=10000,
                          stop_criteria=100,
-                         pseudo_count=0):
+                         pseudo_count=0,
+                         max_batch=1000):
     """Performs null-permutations for position-based mutation statistics
     in a single gene.
 
@@ -129,48 +146,63 @@ def position_permutation(obs_stat,
                     for one_context in mycontexts
                     for base in context_to_mut[one_context]]
 
-    # get random positions determined by sequence context
-    tmp_contxt_pos = seq_context.random_pos(context_counts.iteritems(),
-                                            num_permutations)
-    tmp_mut_pos = np.hstack(pos_array for base, pos_array in tmp_contxt_pos)
+    # calculate the # of batches for simulations
+    max_batch = min(num_permutations, max_batch)
+    num_batches = num_permutations // max_batch
+    remainder = num_permutations % max_batch
+    batch_sizes = [max_batch] * num_batches
+    if remainder:
+        batch_sizes += [remainder]
 
-    # calculate position-based statistics as a result of random positions
-    #stop_count = int(stop_criteria*num_permutations)
-    obs_recur, obs_ent, obs_delta_ent, obs_vest = obs_stat
-    #num_recur_list, entropy_list, delta_entropy_list, vest_list = [], [], [], []
     null_num_recur_ct, null_entropy_ct, null_delta_entropy_ct, null_vest_ct = 0, 0, 0, 0
-    for i, row in enumerate(tmp_mut_pos):
-        # get info about mutations
-        tmp_mut_info = mc.get_aa_mut_info(row,
-                                          somatic_base,
-                                          gene_seq)
-
-        # calculate position info
-        tmp_recur_ct, tmp_entropy, tmp_delta_entropy, _ = cutils.calc_pos_info(tmp_mut_info['Codon Pos'],
-                                                                               tmp_mut_info['Reference AA'],
-                                                                               tmp_mut_info['Somatic AA'],
-                                                                               pseudo_count=pseudo_count,
-                                                                               is_obs=0)
-        # get vest scores
-        if gene_vest:
-            tmp_vest = scores.compute_vest_stat(gene_vest,
-                                                tmp_mut_info['Reference AA'],
-                                                tmp_mut_info['Somatic AA'],
-                                                tmp_mut_info['Codon Pos'])
-        else:
-            tmp_vest = 0.0
-
-        # update empirical null distribution counts
-        if tmp_entropy-utils.epsilon <= obs_ent: null_entropy_ct += 1
-        if tmp_vest+utils.epsilon >= obs_vest: null_vest_ct += 1
-
+    for j, batch_size in enumerate(batch_sizes):
         # stop iterations if reached sufficient precision
         if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
+            i = 0
             break
 
+        # get random positions determined by sequence context
+        tmp_contxt_pos = seq_context.random_pos(context_counts.iteritems(),
+                                                num_permutations)
+        tmp_mut_pos = np.hstack(pos_array for base, pos_array in tmp_contxt_pos)
+
+        # calculate position-based statistics as a result of random positions
+        #stop_count = int(stop_criteria*num_permutations)
+        obs_recur, obs_ent, obs_delta_ent, obs_vest = obs_stat
+        #num_recur_list, entropy_list, delta_entropy_list, vest_list = [], [], [], []
+        for i, row in enumerate(tmp_mut_pos):
+            # get info about mutations
+            tmp_mut_info = mc.get_aa_mut_info(row,
+                                            somatic_base,
+                                            gene_seq)
+
+            # calculate position info
+            tmp_recur_ct, tmp_entropy, tmp_delta_entropy, _ = cutils.calc_pos_info(tmp_mut_info['Codon Pos'],
+                                                                                tmp_mut_info['Reference AA'],
+                                                                                tmp_mut_info['Somatic AA'],
+                                                                                pseudo_count=pseudo_count,
+                                                                                is_obs=0)
+            # get vest scores
+            if gene_vest:
+                tmp_vest = scores.compute_vest_stat(gene_vest,
+                                                    tmp_mut_info['Reference AA'],
+                                                    tmp_mut_info['Somatic AA'],
+                                                    tmp_mut_info['Codon Pos'])
+            else:
+                tmp_vest = 0.0
+
+            # update empirical null distribution counts
+            if tmp_entropy-utils.epsilon <= obs_ent: null_entropy_ct += 1
+            if tmp_vest+utils.epsilon >= obs_vest: null_vest_ct += 1
+
+            # stop iterations if reached sufficient precision
+            if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
+                break
+
     # calculate p-value from empirical null-distribution
-    ent_pval = float(null_entropy_ct) / (i+1)
-    vest_pval = float(null_vest_ct) / (i+1)
+    num_sim = j*max_batch + i+1
+    ent_pval = float(null_entropy_ct) / (num_sim)
+    vest_pval = float(null_vest_ct) / (num_sim)
 
     return ent_pval, vest_pval