Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated to py3 created new CR functions that use np arrays, added some docstrings #2

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.DS_Store
__pycache__
76 changes: 0 additions & 76 deletions bootstrap.py

This file was deleted.

69 changes: 27 additions & 42 deletions example/GetAvg_CR.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,27 @@
import scipy
import re
from bootstrap import get_CR


RunNums = 5
Rates = {}
MinVal = 10000  # all zeros are later replaced by the smallest value measured

# Read the rate timeseries of each replicate run from rate_01.dat ... rate_05.dat
for run in range(1, RunNums + 1):
    Rates[run] = []
    Times = []  # overwritten each iteration; every file shares the same time axis
    run_tag = "{0:02}".format(run)
    with open("rate_" + run_tag + ".dat") as file_in:
        for line in file_in:
            words = line.split()
            # Keep only data lines: first token must contain a digit
            if words and re.search(r"\d", words[0]):
                Times.append(words[0])
                rate = float(words[1])
                Rates[run].append(rate)
                # Track the smallest strictly positive rate seen anywhere
                if 0 < rate < MinVal:
                    MinVal = rate


with open("AvgRates_CR.dat", "w") as file_out:
    for i in range(len(Rates[1])):
        print("time step", i + 1, "/ ", len(Rates[1]))
        curr_data = [Rates[run][i] for run in range(1, RunNums + 1)]
        # Average is taken BEFORE zero replacement (original behavior)
        curr_avg = scipy.average(curr_data)

        # Replace zeros with the smallest measured value before bootstrapping
        curr_data = [MinVal if x == 0 else x for x in curr_data]

        # 10000-fold Bayesian bootstrapping performed
        cr_min, cr_max = get_CR(curr_data, 10000)

        print(Times[i], "\t", curr_avg, "\t", cr_max, "\t", cr_min, file=file_out)


from bootstrap import get_CR_multi
import numpy as np
import matplotlib.pyplot as plt

n_runs = 5

# Grab the rate column (index 1) of each replicate file.
# Zero-padded run number ({i:02d}) so runs >= 10 still resolve correctly.
rates = np.array(
    [np.loadtxt(f"rate_{i:02d}.dat")[:, 1] for i in range(1, n_runs + 1)]
)

# Smallest non-zero value measured across all runs and timepoints
min_val = np.amin(rates[np.nonzero(rates)])
# Replace all zeros with the smallest measured value (keeps log scale finite)
rates[rates == 0] = min_val

# Mean of the n_runs replicates at each timepoint
means = np.average(rates, axis=0)
# 100-fold Bayesian-bootstrap credibility regions at each timepoint
CRs = get_CR_multi(rates, 100)

# Plotting: mean trace, shaded CR band, and the individual replicate traces.
# Use rates.T (not np.rot90, which also reverses the time axis) so replicates
# share the same x-axis direction as the mean.
timepoints = np.arange(rates.shape[1])
plt.plot(means)
plt.fill_between(timepoints, CRs[:, 0], CRs[:, 1], alpha=0.2)
plt.plot(rates.T)
plt.yscale("log", subs=[2, 3, 4, 5, 6, 7, 8, 9])
plt.show()
186 changes: 130 additions & 56 deletions example/bootstrap.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,150 @@
#!/usr/bin/python

import scipy
#import scipy
import random

import numpy as np
from tqdm.auto import tqdm

################################################
############## DEFINING CI FUNCTION ############
################################################

def get_CI(List, Repeat):
    """
    Standard (frequentist) bootstrap 95% confidence interval for the mean.

    Resamples `List` with replacement `Repeat` times and takes the 2.5th
    and 97.5th percentiles of the resampled means.

    Returns [perc_min, perc_max], or [0, 0] when every element of `List`
    is identical (no spread, so no interval).
    """
    if len(set(List)) == 1:
        # Degenerate data: all observations identical
        return [0, 0]

    n_points = len(List)
    boot_means = []
    for _ in range(Repeat):
        # Resample the data with replacement and record the sample mean
        resample = [random.choice(List) for _ in range(n_points)]
        boot_means.append(np.average(resample))

    # The CI is bounded by the 2.5% / 97.5% percentiles of the bootstrap means
    perc_min = np.percentile(boot_means, 2.5)
    perc_max = np.percentile(boot_means, 97.5)
    return [perc_min, perc_max]


################################################
############## DEFINING CR FUNCTION ############
################################################

def get_CR (List, Repeat) :
"""
Bayesian-bootstrap 95% credibility region (CR) for the mean of `List`.

NOTE(review): pre-refactor, scipy-based version; the logic appears identical
to `get_CR_bm` below, with scipy.diff in place of np.diff.

Returns [perc_min, perc_max], or [0, 0] when all elements are identical.
"""

if len(set(List)) == 1 : # NO CR if List has only identical elements
return [0,0]

else :
AllMeans = [] # List of model(!) sample means for every bootstrap iteration
N = len(List) # Number of data points

for i in range(Repeat) : # Repeated bootstrap iterations
Rands = [0] # Following Rubin et al. to get data probabilities from Dirichlet distrib.
CurrAvg = 0
for j in range(N-1) :
Rands.append(random.random())
Rands.append(1)
Rands.sort()
# Gaps between sorted uniforms on [0, 1]: a flat Dirichlet weight vector
P=scipy.diff(Rands) # List of random numbers that add to 1 and are used as data probabilities
for j in range(N) :
CurrAvg += P[j]*List[j] # Sample mean
AllMeans.append(CurrAvg)

AllMeans.sort()
TotalProb = len(AllMeans)
CumulProb = 0
perc_min = 0
perc_max = 0
# 0 doubles as the "not yet set" sentinel for perc_min/perc_max below
for m in AllMeans : # Iterating through sorted means, identifying that mean at which a certain percentile of probs is reached
CumulProb += 1
if (CumulProb > 0.025*TotalProb) and (perc_min == 0) :
perc_min = m
if (CumulProb > 0.975*TotalProb) and (perc_max == 0):
perc_max = m

return [perc_min, perc_max] # Credibility Region is defined by min/max percentiles of sampling means
def get_CR_bm(List, Repeat):
    """
    Original Bayesian bootstrapping function from BM.

    Draws Dirichlet data weights (following Rubin et al.) `Repeat` times,
    forms the weighted sample mean each time, and reports the means at
    which the sorted cumulative count passes 2.5% and 97.5%.

    Returns [perc_min, perc_max], or [0, 0] when every element of `List`
    is identical (no spread, so no credibility region).
    """
    if len(set(List)) == 1:
        # Degenerate data: all observations identical
        return [0, 0]

    n_points = len(List)
    sample_means = []
    for _ in range(Repeat):
        # Dirichlet weights as gaps between sorted uniforms on [0, 1]
        cuts = [0] + [random.random() for _ in range(n_points - 1)] + [1]
        cuts.sort()
        weights = np.diff(cuts)  # non-negative, sums to 1
        # Weighted (model) sample mean for this iteration
        sample_means.append(sum(w * x for w, x in zip(weights, List)))

    sample_means.sort()
    total = len(sample_means)
    # 0 doubles as the "not yet set" sentinel for both bounds
    perc_min = 0
    perc_max = 0
    for count, mean in enumerate(sample_means, start=1):
        # First sorted mean past each cumulative-probability threshold
        if count > 0.025 * total and perc_min == 0:
            perc_min = mean
        if count > 0.975 * total and perc_max == 0:
            perc_max = mean

    return [perc_min, perc_max]

#####################################################################
################## Updated functions using numpy ####################
#####################################################################

def get_CR_single(rates, repeat):
    """
    Get a set of min and max credibility regions (CRs) from bayesian
    bootstrapping for a 1d array of n_replicates at a single timepoint.

    Parameters
    ----------
    rates : 1darray (or any 1d sequence of numbers)
        Rates of each replicate at a single timepoint.
    repeat : int
        n-fold bayesian bootstrapping.

    Returns
    -------
    CRs : 1darray (min CR | max CR)
        2 item array of calculated min and max CRs for the input rates array.
    """
    # Accept plain sequences as well as ndarrays
    rates = np.asarray(rates)

    # No spread -> no credibility region
    if np.unique(rates).size == 1:
        return np.array([0, 0])

    # Draw ALL Dirichlet weight vectors in one vectorized call, shape
    # (repeat, n_replicates), instead of one per Python-loop iteration.
    weights = np.random.dirichlet(np.ones(rates.size), size=repeat)
    # Weighted sample mean of each bootstrap iteration, sorted ascending
    all_means = np.sort(weights @ rates)

    # Index of the first sorted mean whose cumulative count exceeds the
    # 2.5% / 97.5% thresholds (same selection rule as the loop version).
    cumul_prob = np.arange(1, repeat + 1)
    perc_min = all_means[np.argmax(cumul_prob > 0.025 * repeat)]
    perc_max = all_means[np.argmax(cumul_prob > 0.975 * repeat)]

    return np.array([perc_min, perc_max])

def get_CR_multi(rates_multi, repeat):
    """
    Calculate the credibility regions at every timepoint.

    Parameters
    ----------
    rates_multi : 2darray
        Rates for multiple replicates at multiple timepoints; n_replicate
        rows and n_timepoint columns.
    repeat : int
        n-fold bayesian bootstrapping, passed through to `get_CR_single`.

    Returns
    -------
    CRs : 2darray
        Calculated min and max CRs for n_replicates at each timepoint.
        n_timepoints rows and 2 columns (min CR and max CR).
    """
    n_timepoints = rates_multi.shape[1]
    CRs = np.zeros((n_timepoints, 2))
    # Transpose so each iteration sees the n_replicate rates of one timepoint;
    # tqdm provides a progress bar over the timepoints.
    for t, replicate_rates in enumerate(tqdm(rates_multi.T)):
        CRs[t] = get_CR_single(replicate_rates, repeat)
    return CRs

# check the updated numpy version works
# rates = [0.02, 0.005, 0.033]
# print(get_CR_bm(rates, 10000))
# print(get_CR_single(rates, 10000))