Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated to py3 created new CR functions that use np arrays, added some docstrings #2

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.DS_Store
__pycache__
76 changes: 0 additions & 76 deletions bootstrap.py

This file was deleted.

69 changes: 27 additions & 42 deletions example/GetAvg_CR.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,27 @@
import scipy
import re
from bootstrap import get_CR


RunNums = 5
Rates = {}
MinVal = 10000  # all zeros are later replaced by the smallest value measured

# Read the rate timeseries of each replicate run from rate_01.dat ... rate_05.dat
for run in range(1, RunNums + 1):
    Rates[run] = []
    Times = []  # overwritten each iteration; every file shares the same time axis
    run_tag = "{0:02}".format(run)
    with open("rate_" + run_tag + ".dat") as file_in:
        for line in file_in:
            words = line.split()
            # Keep only data lines: first token must contain a digit
            if words and re.search(r"\d", words[0]):
                Times.append(words[0])
                rate = float(words[1])
                Rates[run].append(rate)
                # Track the smallest strictly positive rate seen anywhere
                if 0 < rate < MinVal:
                    MinVal = rate


with open("AvgRates_CR.dat", "w") as file_out:
    for i in range(len(Rates[1])):
        print("time step", i + 1, "/ ", len(Rates[1]))
        curr_data = [Rates[run][i] for run in range(1, RunNums + 1)]
        # Average is taken BEFORE zero replacement (original behavior)
        curr_avg = scipy.average(curr_data)

        # Replace zeros with the smallest measured value before bootstrapping
        curr_data = [MinVal if x == 0 else x for x in curr_data]

        # 10000-fold Bayesian bootstrapping performed
        cr_min, cr_max = get_CR(curr_data, 10000)

        print(Times[i], "\t", curr_avg, "\t", cr_max, "\t", cr_min, file=file_out)


from bootstrap import get_CR_multi
import numpy as np
import matplotlib.pyplot as plt

n_runs = 5

# Grab the rate column (index 1) of each replicate file.
# Zero-padded run number ({i:02d}) so runs >= 10 still resolve correctly.
rates = np.array(
    [np.loadtxt(f"rate_{i:02d}.dat")[:, 1] for i in range(1, n_runs + 1)]
)

# Smallest non-zero value measured across all runs and timepoints
min_val = np.amin(rates[np.nonzero(rates)])
# Replace all zeros with the smallest measured value (keeps log scale finite)
rates[rates == 0] = min_val

# Mean of the n_runs replicates at each timepoint
means = np.average(rates, axis=0)
# 100-fold Bayesian-bootstrap credibility regions at each timepoint
CRs = get_CR_multi(rates, 100)

# Plotting: mean trace, shaded CR band, and the individual replicate traces.
# Use rates.T (not np.rot90, which also reverses the time axis) so replicates
# share the same x-axis direction as the mean.
timepoints = np.arange(rates.shape[1])
plt.plot(means)
plt.fill_between(timepoints, CRs[:, 0], CRs[:, 1], alpha=0.2)
plt.plot(rates.T)
plt.yscale("log", subs=[2, 3, 4, 5, 6, 7, 8, 9])
plt.show()
186 changes: 130 additions & 56 deletions example/bootstrap.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,150 @@
#!/usr/bin/python

import scipy
#import scipy
import random

import numpy as np
from tqdm.auto import tqdm

################################################
############## DEFINING CI FUNCTION ############
################################################

def get_CI(List, Repeat):
    """
    Standard (frequentist) bootstrap 95% confidence interval for the mean.

    Resamples `List` with replacement `Repeat` times and takes the 2.5th
    and 97.5th percentiles of the resampled means.

    Returns [perc_min, perc_max], or [0, 0] when every element of `List`
    is identical (no spread, so no interval).
    """
    if len(set(List)) == 1:
        # Degenerate data: all observations identical
        return [0, 0]

    n_points = len(List)
    boot_means = []
    for _ in range(Repeat):
        # Resample the data with replacement and record the sample mean
        resample = [random.choice(List) for _ in range(n_points)]
        boot_means.append(np.average(resample))

    # The CI is bounded by the 2.5% / 97.5% percentiles of the bootstrap means
    perc_min = np.percentile(boot_means, 2.5)
    perc_max = np.percentile(boot_means, 97.5)
    return [perc_min, perc_max]


################################################
############## DEFINING CR FUNCTION ############
################################################

def get_CR (List, Repeat) :
"""
Bayesian-bootstrap 95% credibility region (CR) for the mean of `List`.

NOTE(review): pre-refactor, scipy-based version; the logic appears identical
to `get_CR_bm` below, with scipy.diff in place of np.diff.

Returns [perc_min, perc_max], or [0, 0] when all elements are identical.
"""

if len(set(List)) == 1 : # NO CR if List has only identical elements
return [0,0]

else :
AllMeans = [] # List of model(!) sample means for every bootstrap iteration
N = len(List) # Number of data points

for i in range(Repeat) : # Repeated bootstrap iterations
Rands = [0] # Following Rubin et al. to get data probabilities from Dirichlet distrib.
CurrAvg = 0
for j in range(N-1) :
Rands.append(random.random())
Rands.append(1)
Rands.sort()
# Gaps between sorted uniforms on [0, 1]: a flat Dirichlet weight vector
P=scipy.diff(Rands) # List of random numbers that add to 1 and are used as data probabilities
for j in range(N) :
CurrAvg += P[j]*List[j] # Sample mean
AllMeans.append(CurrAvg)

AllMeans.sort()
TotalProb = len(AllMeans)
CumulProb = 0
perc_min = 0
perc_max = 0
# 0 doubles as the "not yet set" sentinel for perc_min/perc_max below
for m in AllMeans : # Iterating through sorted means, identifying that mean at which a certain percentile of probs is reached
CumulProb += 1
if (CumulProb > 0.025*TotalProb) and (perc_min == 0) :
perc_min = m
if (CumulProb > 0.975*TotalProb) and (perc_max == 0):
perc_max = m

return [perc_min, perc_max] # Credibility Region is defined by min/max percentiles of sampling means
def get_CR_bm(List, Repeat):
    """
    Original Bayesian bootstrapping function from BM.

    Draws Dirichlet data weights (following Rubin et al.) `Repeat` times,
    forms the weighted sample mean each time, and reports the means at
    which the sorted cumulative count passes 2.5% and 97.5%.

    Returns [perc_min, perc_max], or [0, 0] when every element of `List`
    is identical (no spread, so no credibility region).
    """
    if len(set(List)) == 1:
        # Degenerate data: all observations identical
        return [0, 0]

    n_points = len(List)
    sample_means = []
    for _ in range(Repeat):
        # Dirichlet weights as gaps between sorted uniforms on [0, 1]
        cuts = [0] + [random.random() for _ in range(n_points - 1)] + [1]
        cuts.sort()
        weights = np.diff(cuts)  # non-negative, sums to 1
        # Weighted (model) sample mean for this iteration
        sample_means.append(sum(w * x for w, x in zip(weights, List)))

    sample_means.sort()
    total = len(sample_means)
    # 0 doubles as the "not yet set" sentinel for both bounds
    perc_min = 0
    perc_max = 0
    for count, mean in enumerate(sample_means, start=1):
        # First sorted mean past each cumulative-probability threshold
        if count > 0.025 * total and perc_min == 0:
            perc_min = mean
        if count > 0.975 * total and perc_max == 0:
            perc_max = mean

    return [perc_min, perc_max]

#####################################################################
################## Updated functions using numpy ####################
#####################################################################

def get_CR_single(rates, repeat):
    """
    Get a set of min and max credibility regions (CRs) from bayesian
    bootstrapping for a 1d array of n_replicates at a single timepoint.

    Parameters
    ----------
    rates : 1darray (or any 1d sequence of numbers)
        Rates of each replicate at a single timepoint.
    repeat : int
        n-fold bayesian bootstrapping.

    Returns
    -------
    CRs : 1darray (min CR | max CR)
        2 item array of calculated min and max CRs for the input rates array.
    """
    # Accept plain sequences as well as ndarrays
    rates = np.asarray(rates)

    # No spread -> no credibility region
    if np.unique(rates).size == 1:
        return np.array([0, 0])

    # Draw ALL Dirichlet weight vectors in one vectorized call, shape
    # (repeat, n_replicates), instead of one per Python-loop iteration.
    weights = np.random.dirichlet(np.ones(rates.size), size=repeat)
    # Weighted sample mean of each bootstrap iteration, sorted ascending
    all_means = np.sort(weights @ rates)

    # Index of the first sorted mean whose cumulative count exceeds the
    # 2.5% / 97.5% thresholds (same selection rule as the loop version).
    cumul_prob = np.arange(1, repeat + 1)
    perc_min = all_means[np.argmax(cumul_prob > 0.025 * repeat)]
    perc_max = all_means[np.argmax(cumul_prob > 0.975 * repeat)]

    return np.array([perc_min, perc_max])

def get_CR_multi(rates_multi, repeat):
    """
    Calculate the credibility regions at every timepoint.

    Parameters
    ----------
    rates_multi : 2darray
        Rates for multiple replicates at multiple timepoints; n_replicate
        rows and n_timepoint columns.
    repeat : int
        n-fold bayesian bootstrapping, passed through to `get_CR_single`.

    Returns
    -------
    CRs : 2darray
        Calculated min and max CRs for n_replicates at each timepoint.
        n_timepoints rows and 2 columns (min CR and max CR).
    """
    n_timepoints = rates_multi.shape[1]
    CRs = np.zeros((n_timepoints, 2))
    # Transpose so each iteration sees the n_replicate rates of one timepoint;
    # tqdm provides a progress bar over the timepoints.
    for t, replicate_rates in enumerate(tqdm(rates_multi.T)):
        CRs[t] = get_CR_single(replicate_rates, repeat)
    return CRs

# check the updated numpy version works
# rates = [0.02, 0.005, 0.033]
# print(get_CR_bm(rates, 10000))
# print(get_CR_single(rates, 10000))