diff --git a/autotest/la_tests.py b/autotest/la_tests.py
index f8538c5b..42d7cea2 100644
--- a/autotest/la_tests.py
+++ b/autotest/la_tests.py
@@ -607,7 +607,8 @@ def ends_freyberg_test(tmp_path):
     ends = pyemu.EnDS(pst=pst, sim_ensemble=oe, obscov=os.path.join(test_d, "obs.unc"),predictions=predictions)
 
-def ends_freyberg_dsi_test(tmp_path):
+
+def ends_run_freyberg_dsi(tmp_path,nst=False,nst_extrap=None,ztz=False,energy=1.0):
     import pyemu
     import os
     test_d = "ends_master"
@@ -621,58 +622,44 @@ def ends_freyberg_dsi_test(tmp_path):
     oe_name = pst_name.replace(".pst", ".0.obs.csv")
     oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :]
-
     ends = pyemu.EnDS(pst=pst, sim_ensemble=oe,verbose=True)
     t_d = os.path.join(tmp_path,"dsi_template")
-    ends.prep_for_dsi(t_d=t_d)
+
+    ends.prep_for_dsi(t_d=t_d,
+                      use_ztz=ztz,
+                      apply_normal_score_transform=nst,
+                      nst_extrap=nst_extrap,
+                      energy=energy)
     # copy exe to dsi_template
     #shutil.copy2(os.path.join(test_d,"pestpp-ies.exe"),os.path.join(t_d,"pestpp-ies.exe"))
-
+    filename = os.path.join(t_d,"dsi.0.obs.csv")
+    if os.path.exists(filename):
+        os.remove(filename)
     pst = pyemu.Pst(os.path.join(t_d,"dsi.pst"))
-    pst.control_data.noptmax = 0
+    pst.control_data.noptmax = -1
     pst.write(os.path.join(t_d,"dsi.pst"),version=2)
-    #pyemu.os_utils.run("pestpp-ies dsi.pst",cwd="dsi_template")
-    m_d = os.path.join(tmp_path,"master_dsi")
-    pyemu.os_utils.start_workers(t_d,"pestpp-ies","dsi.pst",num_workers=15,worker_root=tmp_path,
-                                 master_dir=m_d)
+    pyemu.os_utils.run("pestpp-ies dsi.pst",cwd=t_d)
 
-    # run test wtih truncated svd
-    ends.prep_for_dsi(t_d=t_d,truncated_svd=True)
-    # copy exe to dsi_template
-    #shutil.copy2(os.path.join(test_d,"pestpp-ies.exe"),os.path.join(t_d,"pestpp-ies.exe"))
+
+    #read in the results
+    oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=os.path.join(t_d,"dsi.0.obs.csv"))
+    assert oe.shape[0]==50, f"{50-oe.shape[0]} failed runs"
+    phi_vector = oe.phi_vector.sort_values().values
+    assert phi_vector[0] != phi_vector[1],phi_vector
 
-    pst = pyemu.Pst(os.path.join(t_d,"dsi.pst"))
-    pst.control_data.noptmax = 0
-    pst.write(os.path.join(t_d,"dsi.pst"),version=2)
-    pyemu.os_utils.start_workers(t_d,"pestpp-ies","dsi.pst",num_workers=15,worker_root=tmp_path,
-                                 master_dir=m_d)
+def ends_freyberg_dsi_test(tmp_path):
+    ends_run_freyberg_dsi(tmp_path)
 
+def ends_freyberg_dsi_nst_test(tmp_path):
+    ends_run_freyberg_dsi(tmp_path,nst=True,nst_extrap=None)
 
-    # run test wtih normal score transform
-    ends.prep_for_dsi(t_d=t_d,apply_normal_score_transform=True)
-    # copy exe to dsi_template
-    #shutil.copy2(os.path.join(test_d,"pestpp-ies.exe"),os.path.join(t_d,"pestpp-ies.exe"))
-
-    pst = pyemu.Pst(os.path.join(t_d,"dsi.pst"))
-    pst.control_data.noptmax = 0
-    pst.write(os.path.join(t_d,"dsi.pst"),version=2)
-    pyemu.os_utils.start_workers(t_d,"pestpp-ies","dsi.pst",num_workers=15,worker_root=tmp_path,
-                                 master_dir=m_d)
-
-    # run test with log-transform
-    pst = pyemu.Pst(pst_name)
-    pst.pestpp_options["predictions"] = predictions
-    pst.observation_data["obstransform"] = "log"
-    ends = pyemu.EnDS(pst=pst, sim_ensemble=oe,verbose=True)
-    ends.prep_for_dsi(t_d=t_d,apply_normal_score_transform=False)
-    # copy exe to dsi_template
-    #shutil.copy2(os.path.join(test_d,"pestpp-ies.exe"),os.path.join(t_d,"pestpp-ies.exe"))
-
-    pst = pyemu.Pst(os.path.join(t_d,"dsi.pst"))
-    pst.control_data.noptmax = 0
-    pst.write(os.path.join(t_d,"dsi.pst"),version=2)
-    pyemu.os_utils.start_workers(t_d,"pestpp-ies","dsi.pst",num_workers=15,worker_root=tmp_path,
-                                 master_dir=m_d)
+def ends_freyberg_dsi_extrap_test(tmp_path):
+    ends_run_freyberg_dsi(tmp_path,nst=True,nst_extrap='quadratic')
+
+def ends_freyberg_dsi_ztz_test(tmp_path):
+    ends_run_freyberg_dsi(tmp_path,ztz=True)
+
+def ends_freyberg_dsi_svd_test(tmp_path):
+    ends_run_freyberg_dsi(tmp_path,ztz=True,energy=0.999)
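The five thin wrappers above keep each DSI option combination discoverable (and re-runnable) as its own test. A hypothetical pytest-parametrized equivalent, not part of this change, would collapse them into one test function:

```python
# Hypothetical alternative to the named wrappers above; assumes pytest and
# that ends_run_freyberg_dsi is importable from the test module.
import pytest

@pytest.mark.parametrize(
    "kwargs",
    [
        dict(),                                  # plain DSI
        dict(nst=True, nst_extrap=None),         # normal score transform
        dict(nst=True, nst_extrap="quadratic"),  # NST with extrapolation
        dict(ztz=True),                          # condensed ZtZ SVD
        dict(ztz=True, energy=0.999),            # energy-based truncation
    ],
)
def test_ends_freyberg_dsi_variants(tmp_path, kwargs):
    ends_run_freyberg_dsi(tmp_path, **kwargs)
```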
 
 
 def plot_freyberg_dsi():
@@ -738,15 +725,28 @@ def dsi_normscoretransform_test():
     oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :]
 
     nstval = randrealgen_optimized(oe.shape[0], 1e-7, 1e4)
+    window_size=3
+    if oe.shape[0]>40:
+        window_size=5
+    if oe.shape[0]>90:
+        window_size=7
+    if oe.shape[0]>200:
+        window_size=9
     for name in oe.columns:
         print("transforming:",name)
         sorted_values = oe._df.loc[:,name].sort_values().copy()
+        #if all values are the same, skip
+        if sorted_values.iloc[0] == sorted_values.iloc[-1]:
+            print("all values are the same, skipping")
+            continue
+        sorted_values.loc[:] = pyemu.eds.moving_average_with_endpoints(sorted_values.values, window_size)
         transformed_values = np.asarray([normal_score_transform(nstval, sorted_values, value)[0] for value in sorted_values])
         backtransformed_values = np.asarray([inverse_normal_score_transform(nstval, sorted_values, value)[0] for value in transformed_values])
         diff = backtransformed_values-sorted_values
         assert max(abs(diff))<1e-7, backtransformed_values
 
+
 if __name__ == "__main__":
     #dsi_normscoretransform_test()
     #ends_freyberg_dev()
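For reference, a minimal sketch of the round trip this test exercises, assuming the helpers behave as patched here (a strictly increasing value table, invertible on table points):

```python
# Minimal NST round-trip sketch; a sorted continuous sample is strictly
# increasing almost surely, so no smoothing is needed for this toy case.
import numpy as np
from pyemu.utils.helpers import (randrealgen_optimized,
                                 normal_score_transform,
                                 inverse_normal_score_transform)

vals = np.sort(np.random.default_rng(0).lognormal(sigma=0.5, size=100))
nstval = randrealgen_optimized(vals.shape[0], 1e-7, 10000)  # symmetric normal scores
fwd = np.array([normal_score_transform(nstval, vals, v)[0] for v in vals])
back = np.array([inverse_normal_score_transform(nstval, vals, v)[0] for v in fwd])
assert np.abs(back - vals).max() < 1e-7  # invertible on the table points
```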
diff --git a/pyemu/eds.py b/pyemu/eds.py
index 49ed06e3..298c813c 100644
--- a/pyemu/eds.py
+++ b/pyemu/eds.py
@@ -11,7 +11,7 @@
 from pyemu.mat.mat_handler import Matrix, Jco, Cov
 from pyemu.pst.pst_handler import Pst
 from pyemu.utils.os_utils import _istextfile,run
-from pyemu.utils.helpers import normal_score_transform,randrealgen_optimized,inverse_normal_score_transform
+from pyemu.utils.helpers import normal_score_transform,randrealgen_optimized
 from .logger import Logger
 
@@ -496,7 +496,9 @@ def get_posterior_prediction_moments(self, obslist_dict=None,sim_ensemble=None,i
 
         return mean_dfs,dfstd,dfper
 
-    def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template",apply_normal_score_transform=False,truncated_svd=False):
+    def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template",
+                     apply_normal_score_transform=False,nst_extrap=None,
+                     use_ztz=False,energy=1.0):
         """Setup a new PEST interface for the data-space inversion process. If the observation data
         in the Pst object has a "obstransform" column, then observations for which "log" is specified
         will be subject to log-transformation. If the `apply_normal_score_transform` flag is set to `True`,
         then the observations and predictions will be subject to a normal score transform.
@@ -508,8 +510,11 @@ def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template",apply_normal_score_tr
             t_d (`str`): template directory to setup the DSI model + pest files in. Default is `dsi_template`
             apply_normal_score_transform (`bool`): flag to apply a normal score transform to the observations
                 and predictions. Default is `False`
-            truncated_svd (`bool`): flag to use a truncated SVD for the pseudo-inverse of the deviations matrix.
+            nst_extrap (`str`): flag to apply extrapolation to the normal score transform. Can be None,
+                'linear' or 'quadratic'. Default is None.
+            use_ztz (`bool`): flag to use the condensed ZtZ matrix for SVD. The ZtZ matrix has dimensions
+                nreal*nreal, instead of the nreal*nobs dimensions of Z. This makes the SVD computation
+                faster and more memory efficient when nobs >> nreal. Default is `False`
+            energy (`float`): energy threshold for truncating the sqrt(C) matrix. Default is `1.0`,
+                which applies no truncation.
 
         Example::
 
@@ -525,12 +530,15 @@
         if sim_ensemble is None:
             sim_ensemble = self.sim_ensemble.copy()
 
+        if nst_extrap is not None:
+            assert nst_extrap in ["linear","quadratic"], "nst_extrap must be None, 'linear' or 'quadratic'"
+
         if os.path.exists(t_d):
             self.logger.warn("EnDS.prep_for_dsi(): t_d '{0}' exists, removing...".format(t_d))
             shutil.rmtree(t_d)
         os.makedirs(t_d)
 
-        self.logger.log("getting deviations")
+
         nz_names = self.pst.nnz_obs_names
         snz_names = set(nz_names)
         z_names = [n for n in self.pst.obs_names if n not in snz_names]
@@ -538,39 +546,68 @@ def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template",apply_normal_score_tr
         names.extend(nz_names)
         names.sort()
+        # make sure names are sorted
+        sim_ensemble = sim_ensemble.loc[:,names]
+
+        self.logger.log("applying transformations")
         # implement log-transform/offset and normal score transform
         transf_names = nz_names.copy()
         transf_names.extend(self.predictions)
         if "obstransform" in self.pst.observation_data.columns:
-            obs = self.pst.observation_data.loc[transf_names].copy()
+            obs = self.pst.observation_data.copy()
+            #make sure names are ordered
+            obs = obs.loc[names,:]
+            #TODO: deal with "scale" and user-specified "offset"
             obs["offset"] = 0.0
             #TODO: more elegant? incase all 'none' are passed...
             obsnmes = obs.loc[obs.obstransform=='log'].obsnme.values
             if len(obsnmes) > 0:
                 for name in obsnmes: #TODO: make more efficient
                     self.logger.log("applying obs log-transform to:"+name)
-                    values = sim_ensemble.loc[:,name].values
+                    values = sim_ensemble.loc[:,name].astype(float).values
                     offset = abs(min(values))+1.0 #arbitrary; enforce positive values
                     values+=offset
                     assert min(values)>0, "values must be positive. min value is "+str(min(values))
                     sim_ensemble.loc[:,name] = np.log10(values)
                     obs.loc[obs.obsnme==name,'offset'] = offset
             obs[['obsnme','obsval','obstransform','offset']].to_csv(os.path.join(t_d,"dsi_obs_transform.csv"),index=False)
-
+            #numpy binary for i/o speed
+            np.save(os.path.join(t_d,"dsi_obs_offset.npy"),
+                    obs.offset.values,
+                    allow_pickle=False, fix_imports=True)
+            obs['flag'] = 0
+            obs.loc[obs.obstransform=='log', "flag"] = 1
+            np.save(os.path.join(t_d,"dsi_obs_log.npy"),
+                    obs.flag.values,
+                    allow_pickle=False, fix_imports=True)
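The offset bookkeeping persisted to `dsi_obs_offset.npy` and `dsi_obs_log.npy` above is exactly reversible. A minimal sketch, with arbitrary stand-in values:

```python
# Shift a log-flagged series positive before log10, then reverse it the same
# way dsi_forward_run() does; values and offset here are illustrative only.
import numpy as np

values = np.array([-3.2, 0.0, 4.7])   # ensemble values for one log-flagged obs
offset = abs(values.min()) + 1.0      # same arbitrary positive shift as above
forward = np.log10(values + offset)   # what gets stored in the ensemble
recovered = 10.0**forward - offset    # the run-time reversal
assert np.allclose(recovered, values)
```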
+
         if apply_normal_score_transform:
             # prepare for normal score transform
-            nstval = randrealgen_optimized(sim_ensemble.shape[0], 1e-7, 1e4)
-
+            nstval = randrealgen_optimized(sim_ensemble.shape[0])
             back_transform_df = pd.DataFrame()
             self.logger.log("applying normal score transform to non-zero obs and predictions")
             #TODO: make more efficient
-            for name in transf_names:
+            for name in transf_names:
                 print("transforming:",name)
                 values = sim_ensemble._df.loc[:,name].copy()
                 values.sort_values(inplace=True)
-                transformed_values = [normal_score_transform(nstval, values.values, v)[0] for v in values.values]
-                #transformed_values, sorted_values, sorted_idxs = normal_score_transform(values) #transformed data retains the same order as the original data
+                if values.iloc[0] != values.iloc[-1]:
+                    # apply smoothing as per DSI2; window sizes are arbitrary...
+                    window_size=3
+                    if values.shape[0]>40:
+                        window_size=5
+                    if values.shape[0]>90:
+                        window_size=7
+                    if values.shape[0]>200:
+                        window_size=9
+                    #print("window size:",window_size,values.shape[0])
+                    values.loc[:] = moving_average_with_endpoints(values.values, window_size)
+                    transformed_values = [normal_score_transform(nstval, values.values, v)[0] for v in values.values]
+                    #transformed_values, sorted_values, sorted_idxs = normal_score_transform(values) #transformed data retains the same order as the original data
+                elif values.iloc[0] == values.iloc[-1]:
+                    print("all values are the same, skipping nst")
+                    transformed_values = values.values
                 sim_ensemble.loc[values.index,name] = transformed_values
                 df = pd.DataFrame()
                 df['real'] = values.index
@@ -579,27 +616,31 @@ def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template",apply_normal_score_tr
                 df['nstval'] = nstval
                 df['obsnme'] = name
                 back_transform_df=pd.concat([back_transform_df,df],ignore_index=True)
-            back_transform_df.to_csv(os.path.join(t_d,"dsi_obs_backtransform.csv"),index=False)
-
-        Z = sim_ensemble.get_deviations() / np.sqrt(float(sim_ensemble.shape[0] - 1))
-        Z = Z.loc[:,names]
-
-        self.logger.log("getting deviations")
-
-        self.logger.log("pseudo inv of deviations matrix")
+            #back_transform_df.to_csv(os.path.join(t_d,"dsi_obs_backtransform.csv"),index=False)
+            #numpy binary for speed
+            np.save(os.path.join(t_d,"dsi_obs_backtransformvals.npy"),
+                    back_transform_df[['sorted_values',"nstval"]].values,
+                    allow_pickle=False, fix_imports=True)
+            np.save(os.path.join(t_d,"dsi_obs_backtransformobsnmes.npy"),
+                    back_transform_df['obsnme'].values,
+                    allow_pickle=True, fix_imports=True)
+
+        self.logger.log("applying transformations")
 
-        if truncated_svd:
-            deltad = Matrix.from_dataframe(Z)
-            U, S, V = deltad.pseudo_inv_components(maxsing=Z.shape[0],
-                                                   eigthresh=1e-5)
-            S = S.x
-            V = V.x
+        self.logger.log("computing projection matrix")
+        if use_ztz:
+            self.logger.log("using ztz approach...")
+            pmat, s = compute_using_ztz(sim_ensemble)
+            self.logger.log("using ztz approach...")
         else:
-            U, S, V = np.linalg.svd(Z.values, full_matrices=False)
-            V = V.transpose()
-            S = np.diag(S)
-        self.logger.log("pseudo inv of deviations matrix")
+            self.logger.log("using z approach...")
+            pmat, s = compute_using_z(sim_ensemble)
+            self.logger.log("using z approach...")
+        self.logger.log("computing projection matrix")
 
+        self.logger.log("applying truncation...")
+        pmat = apply_energy_based_truncation(energy,s,pmat)
+        self.logger.log("applying truncation...")
 
         self.logger.log("creating tpl files")
         dsi_in_file = os.path.join(t_d, "dsi_pars.csv")
@@ -609,7 +650,7 @@ def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template",apply_normal_score_tr
             ftpl.write("ptf ~\n")
             fin.write("parnme,parval1\n")
             ftpl.write("parnme,parval1\n")
-            npar = S.shape[0]
+            npar = s.shape[0]
             dsi_pnames = []
             for i in range(npar):
                 pname = "dsi_par{0:04d}".format(i)
@@ -620,6 +661,8 @@ def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template",apply_normal_score_tr
         ftpl.close()
 
         mn_vec = sim_ensemble.mean(axis=0)
+        # check that sim_ensemble has names ordered
+        assert (mn_vec.index.values == names).all(), "sim_ensemble names are not ordered"
         mn_in_file = os.path.join(t_d, "dsi_pr_mean.csv")
         mn_tpl_file = mn_in_file + ".tpl"
         fin = open(mn_in_file, 'w')
@@ -638,15 +681,16 @@ def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template",apply_normal_score_tr
         self.logger.log("creating tpl files")
 
         self.logger.log("saving proj mat")
-
-        #pmat = U * S
-        pmat = np.dot(V, S)
         #row_names = ["sing_vec_{0}".format(i) for i in range(pmat.shape[0])]
         pmat = Matrix(x=pmat,col_names=dsi_pnames,row_names=names)
         pmat.col_names = dsi_pnames
-        proj_name = "dsi_proj_mat.jcb" # dont change this name!!!
+        #proj_name = "dsi_proj_mat.jcb" # dont change this name!!!
+        proj_name = "dsi_proj_mat.npy" # dont change this name!!!
         proj_path = os.path.join(t_d,proj_name)
-        pmat.to_coo(proj_path)
+        #pmat.to_coo(proj_path)
+        # use numpy for speed
+        np.save(os.path.join(t_d,proj_name), pmat.x, allow_pickle=False, fix_imports=True)
+
         self.logger.statement("projection matrix dimensions:"+str(pmat.shape))
         self.logger.statement("projection matrix saved to "+proj_path)
         self.logger.log("saving proj mat")
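A toy numpy sketch of what the saved projection matrix encodes: with Z = deviations / sqrt(nreal-1) and Z = U S V^T, pmat = V S, so a vector of standard-normal "dsi_par" values maps to a simulated observation vector via mean + pmat @ p. All names and sizes below are illustrative:

```python
# Stand-in ensemble; cov(pmat @ p) with p ~ N(0, I) equals Z^T Z by construction.
import numpy as np

rng = np.random.default_rng(0)
ens = rng.normal(size=(100, 8))                  # nreal x nobs stand-in ensemble
mean = ens.mean(axis=0)
z = (ens - mean) / np.sqrt(ens.shape[0] - 1.0)
u, s, vt = np.linalg.svd(z, full_matrices=False)
pmat = vt.T @ np.diag(s)                         # nobs x nsing projection matrix
p = rng.normal(size=s.shape[0])                  # one realization of dsi_pars
sim = mean + pmat @ p                            # reconstructed observation vector
```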
@@ -657,35 +701,37 @@ def dsi_forward_run():
             import os
             import numpy as np
             import pandas as pd
-            from pyemu.mat.mat_handler import Matrix
             from pyemu.utils.helpers import inverse_normal_score_transform
-            pmat = Matrix.from_binary("dsi_proj_mat.jcb")
+            pmat = np.load("dsi_proj_mat.npy")
             pvals = pd.read_csv("dsi_pars.csv",index_col=0)
-            pvals = pvals.loc[pmat.col_names,:]
             ovals = pd.read_csv("dsi_pr_mean.csv",index_col=0)
-            ovals = ovals.loc[pmat.row_names,:]
-            sim_vals = ovals + np.dot(pmat.x,pvals.values)
-            filename = "dsi_obs_backtransform.csv"
+            sim_vals = ovals + np.dot(pmat,pvals.values)
+            filename = "dsi_obs_backtransformvals.npy"
             if os.path.exists(filename):
-                back_transform_df = pd.read_csv(filename)
                 print("applying back-transform")
-                obsnmes = back_transform_df.obsnme.unique()
-                back_vals = [inverse_normal_score_transform(
-                                back_transform_df.loc[back_transform_df['obsnme']==o,'nstval'].values,
-                                back_transform_df.loc[back_transform_df['obsnme']==o,'sorted_values'].values,
-                                sim_vals.loc[o].mn
+                backtransformvals = np.load("dsi_obs_backtransformvals.npy")
+                backtransformobsnmes = np.load("dsi_obs_backtransformobsnmes.npy",allow_pickle=True)
+                obsnmes = np.unique(backtransformobsnmes)
+                back_vals = [
+                    inverse_normal_score_transform(
+                        backtransformvals[np.where(backtransformobsnmes==o)][:,1],
+                        backtransformvals[np.where(backtransformobsnmes==o)][:,0],
+                        sim_vals.loc[o].mn,
+                        extrap=None
                     )[0]
-                    for o in obsnmes]
+                    for o in obsnmes
+                ]
                 sim_vals.loc[obsnmes,'mn'] = back_vals
             if os.path.exists("dsi_obs_transform.csv"):
                 print("reversing log-transform")
-                obs = pd.read_csv("dsi_obs_transform.csv")
-                obsnmes = obs.loc[obs.obstransform=='log'].obsnme.unique()
-                back_vals = [
-                    (10**sim_vals.loc[x].mn) - obs.loc[obs.obsnme==x,'offset'].values[0]
-                    for x in obsnmes
-                ]
-                sim_vals.loc[obsnmes,'mn'] = back_vals
+                offset = np.load("dsi_obs_offset.npy")
+                log_trans = np.load("dsi_obs_log.npy")
+                assert log_trans.shape[0] == sim_vals.mn.values.shape[0], f"log transform shape mismatch: {log_trans.shape[0]},{sim_vals.mn.values.shape[0]}"
+                assert offset.shape[0] == sim_vals.mn.values.shape[0], f"offset transform shape mismatch: {offset.shape[0]},{sim_vals.mn.values.shape[0]}"
+                vals = sim_vals.mn.values
+                vals[np.where(log_trans==1)] = 10**vals[np.where(log_trans==1)]
+                vals -= offset
+                sim_vals.loc[:,'mn'] = vals
             print(sim_vals)
             sim_vals.to_csv("dsi_sim_vals.csv")
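The vectorized log/offset reversal above relies on the flag and offset arrays being aligned with `sim_vals`; only flagged entries get exponentiated, and the offset is zero for non-log observations. A minimal sketch with made-up values:

```python
# Illustrative analogs of dsi_obs_log.npy and dsi_obs_offset.npy.
import numpy as np

vals = np.array([1.2, -0.3, 2.0])   # the "mn" column after projection
log_trans = np.array([1, 0, 1])     # 1 where obstransform == 'log'
offset = np.array([5.0, 0.0, 2.5])  # 0.0 for non-log observations
vals[np.where(log_trans == 1)] = 10 ** vals[np.where(log_trans == 1)]
vals -= offset
print(vals)                          # back in native observation space
```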
@@ -713,9 +759,16 @@
         par = pst.parameter_data
         dsi_pars = par.loc[par.parnme.str.startswith("dsi_par"),"parnme"]
         par.loc[dsi_pars,"parval1"] = 0
-        par.loc[dsi_pars,"parubnd"] = 2.5
-        par.loc[dsi_pars,"parlbnd"] = -2.5
+        par.loc[dsi_pars,"parubnd"] = 10.0
+        par.loc[dsi_pars,"parlbnd"] = -10.0
         par.loc[dsi_pars,"partrans"] = "none"
+        with open(os.path.join(t_d,"dsi.unc"),'w') as f:
+            f.write("START STANDARD_DEVIATION\n")
+            for p in dsi_pars:
+                f.write("{0} 1.0\n".format(p))
+            f.write("END STANDARD_DEVIATION")
+        pst.pestpp_options['parcov'] = "dsi.unc"
+
         mn_pars = par.loc[par.parnme.str.startswith("dsi_prmn"),"parnme"]
         par.loc[mn_pars,"partrans"] = "fixed"
         for pname,pval in mn_dict.items():
@@ -731,11 +784,13 @@
         pst.model_command = "python forward_run.py"
         self.logger.log("creating Pst")
 
         import inspect
+        #print([l for l in inspect.getsource(dsi_forward_run).split("\n")])
        lines = [line[12:] for line in inspect.getsource(dsi_forward_run).split("\n")][1:]
-
-
         with open(os.path.join(t_d,"forward_run.py"),'w') as f:
             for line in lines:
+                if nst_extrap is not None:
+                    if "extrap=None" in line:
+                        line = line.replace("None",f"'{nst_extrap}'")
                 f.write(line+"\n")
 
         pst.write(os.path.join(t_d,"dsi.pst"),version=2)
         self.logger.statement("saved pst to {0}".format(os.path.join(t_d,"dsi.pst")))
@@ -747,14 +802,107 @@
 
         return pst
 
+
+def compute_using_z(sim_ensemble):
+    z = sim_ensemble.get_deviations() / np.sqrt(float(sim_ensemble._df.shape[0] - 1))
+    z = z.values
+    u, s, v = np.linalg.svd(z, full_matrices=False)
+    us = np.dot(v.T, np.diag(s))
+    return us,s
+
+def compute_using_ztz(sim_ensemble):
+    # rval are the transformed obs values
+    rval = sim_ensemble._df.copy()
+    #mu2 is the mean of the transformed obs values
+    mu2 = rval.mean()
+    #adjust rval by subtracting mu2
+    rval -= mu2
+    #divide rval by the sqrt of nreal-1
+    nreal = rval.shape[0]
+    rval = rval*np.sqrt(1/(nreal-1))
+    # rval.T to match pest utils implementation
+    z = rval.T.values
+    # Compute the ZtZ matrix
+    ztz = np.dot(z.T,z)
+    assert ztz.shape[0] == z.shape[1], "ZtZ matrix is not square"
+    assert ztz.shape[0] == sim_ensemble.shape[0], "ZtZ matrix is not nreal*nreal"
+
+    #We now do SVD on ZtZ.
+    print("doing SVD on ZtZ")
+    u, s2, v = np.linalg.svd(ztz, full_matrices=False)
+    s = np.sqrt(s2)
+    s[z.shape[0]:] = 0 #truncation to match compute_using_z()
+
+    # formulate the sqrt of the covariance matrix
+    us = np.dot(z,u)
+    return us, s
+
+def apply_energy_based_truncation(energy,s,us):
+    if energy >= 1.0:
+        print("Warning: energy>=1.0, no truncation applied")
+        return us
+    # Determine where to truncate
+    # Determine nn
+    if us.shape[0]==us.shape[1]:
+        nn = us.shape[0] - 1
+    else:
+        nobs = us.shape[0]
+        nreal = us.shape[1]
+        nn = min(nobs, nreal) - 1
+    # Compute total_energy
+    total_energy = np.sum((np.sqrt(s))[:nn])
+    # Find energy truncation point
+    ntrunc = np.where((np.sqrt(s)).cumsum()/total_energy<=energy)[0].shape[0]
+    # Initialize threshold
+    #s1 = s[0]
+    #thresh = 1.0e-7 * s1 #NOTE: JDoh's implementation uses an additional level of truncation
+    #ntrunc = min(np.where(s>=thresh)[0][0], ntrunc)+1
+    ntrunc=ntrunc+1
+    if ntrunc>=us.shape[1]:
+        print("ntrunc>=us.shape[1], no truncation applied")
+    else:
+        print("truncating to {0} singular values".format(ntrunc))
+        # Apply threshold logic
+        us = us[:,:ntrunc]
+    return us
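A hedged toy check of the energy criterion above, assuming the module-level helper added by this patch is importable from `pyemu.eds`: with a fast-decaying spectrum, `energy=0.9` should retain only the leading columns.

```python
# Synthetic singular values: sqrt(s).cumsum()/total = [0.571, 0.857, 1.0, ...],
# so energy=0.9 keeps 2 values below threshold and truncates to 3 columns.
import numpy as np
from pyemu.eds import apply_energy_based_truncation

s = np.array([4.0, 1.0, 0.25, 0.0625])
us = np.random.default_rng(1).normal(size=(20, 4))
us_trunc = apply_energy_based_truncation(0.9, s, us)
assert us_trunc.shape == (20, 3)  # fewer columns retained
```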
+def moving_average_with_endpoints(y_values, window_size):
+    # Ensure the window size is odd
+    if window_size % 2 == 0:
+        raise ValueError("window_size must be odd")
+    # Calculate half-window size
+    half_window = window_size // 2
+    # Initialize the output array
+    smoothed_y = np.zeros_like(y_values)
+    # Handle the endpoints
+    for i in range(0,half_window):
+        # Start
+        smoothed_y[i] = np.mean(y_values[:i + half_window])
+    for i in range(1,half_window+1):
+        # End
+        smoothed_y[-i] = np.mean(y_values[::-1][:i + half_window + 1])
+    # Handle the middle part with the full (odd) window, inclusive of both ends
+    for i in range(half_window, len(y_values) - half_window):
+        smoothed_y[i] = np.mean(y_values[i - half_window:i + half_window + 1])
+    # Enforce endpoints
+    smoothed_y[0] = y_values[0]
+    smoothed_y[-1] = y_values[-1]
+    # Ensure uniqueness by adding small increments if values are duplicated
+    # NOTE: this is a hack to ensure uniqueness in the normal score transform
+    smoothed_y = make_unique(smoothed_y, delta=1e-10)
+    return smoothed_y
+
+
+def make_unique(arr, delta=1e-10):
+    """
+    Modifies a sorted numpy array in-place to ensure all elements are unique.
+
+    Parameters:
+        arr (np.ndarray): The sorted numpy array.
+        delta (float): The minimum increment to apply to duplicate elements.
+            Default is a very small value (1e-10).
+    """
+    for i in range(1, len(arr)):
+        if arr[i] <= arr[i - 1]:
+            arr[i] = arr[i - 1] + delta
-
-
-
-
-
-
-
-
-
-
+    return arr
diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py
index 07ff93b1..1b73b6b8 100644
--- a/pyemu/utils/helpers.py
+++ b/pyemu/utils/helpers.py
@@ -4017,8 +4017,11 @@ def randrealgen_optimized(nreal, tol=1e-7, max_samples=1000000):
     """
     rval = np.zeros(nreal)
    nsamp = 0
-    numsort = (nreal + 1) // 2
-
+    # for odd nreal, leave the middle slot out of numsort so it stays zero (the median score)
+    if nreal % 2 == 0:
+        numsort = (nreal + 1) // 2
+    else:
+        numsort = nreal // 2
     while nsamp < max_samples:
         nsamp += 1
         work1 = np.random.normal(size=nreal)
@@ -4036,7 +4039,10 @@ def randrealgen_optimized(nreal, tol=1e-7, max_samples=1000000):
 
     rval[:numsort] = work1[:numsort]
     rval[:numsort] /= nsamp
-    rval[numsort:] = -rval[:numsort][::-1]
+    if nreal % 2 == 0:
+        rval[numsort:] = -rval[:numsort][::-1]
+    else:
+        rval[numsort+1:] = -rval[:numsort][::-1]
     return rval
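A quick property check of the odd/even handling above, assuming the helper converges within `max_samples`: the scores come out antisymmetric, with an exact zero in the median slot when `nreal` is odd.

```python
import numpy as np
from pyemu.utils.helpers import randrealgen_optimized

for nreal in (10, 11):
    scores = randrealgen_optimized(nreal, 1e-7, 10000)
    assert np.allclose(scores, -scores[::-1])  # mirror symmetry
    if nreal % 2 == 1:
        assert scores[nreal // 2] == 0.0       # middle slot stays zero
```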
@@ -4064,10 +4070,10 @@ def normal_score_transform(nstval, val, value):
 
     nstval = np.asarray(nstval)
 
-    # if the value is outside the range of the table, return the first or last value
-    if value <= val[0]:
-        return nstval[0], 0
-    elif value >= val[-1]:
-        return nstval[-1], len(val)
+    # the value must lie within the range of the table
+    assert value >= val[0], "Value is below the minimum value in the table."
+    assert value <= val[-1], "Value is greater than the maximum value in the table."
+    # ensure that val is sorted
+    assert np.all(np.diff(val) > 0), f"Values in the table must be sorted in ascending order:{list(zip(np.diff(val)>0,val))}"
 
     # find the rank of the value in the table
     rank = np.searchsorted(val, value, side='right') - 1
@@ -4087,23 +4093,31 @@ def normal_score_transform(nstval, val, value):
 
 def inverse_normal_score_transform(nstval, val, value, extrap='quadratic'):
     nreal = len(val)
+    # check that nstval is sorted
+    assert np.all(np.diff(nstval) > 0), "Values in the table must be sorted in ascending order"
+    # check that val is sorted
+    assert np.all(np.diff(val) > 0), "Values in the table must be sorted in ascending order"
 
     def linear_extrapolate(x0, y0, x1, y1, x):
         if x1 != x0:
             return y0 + (y1 - y0) / (x1 - x0) * (x - x0)
         return y0
 
-    def quadratic_extrapolate(x0, y0, x1, y1, x2, y2, x):
-        denom = (x0 - x) * (x1 - x) * (x2 - x)
-        if denom == 0:
-            print(x, x0, x1, x2)
+    def quadratic_extrapolate(x1, y1, x2, y2, x3, y3, x4):
+        y12=y1-y2
+        x23=x2-x3
+        y23=y2-y3
+        x12=x1-x2
+        x13=x1-x3
+        if x12==0 or x23==0 or x13==0:
             raise ValueError("Input x values must be distinct")
-
-        a = ((x - x1) * (y2 - y1) - (x - x2) * (y1 - y0)) / denom
-        b = ((x - x2) * (y1 - y0) - (x - x0) * (y2 - y1)) / denom
-        c = y0 - a * x0**2 - b * x0
-        y = a * x**2 + b * x + c
-        return y
+        a = (y12*x23-y23*x12)
+        den = x12*x23*x13
+        a = a/den
+        b = y23/x23 - a*(x2+x3)
+        c = y1-x1*(a*x1+b)
+        y4 = a*x4**2 + b*x4 + c
+        return y4
 
     ilim = 0
     if value in nstval:
@@ -4116,7 +4130,7 @@ def quadratic_extrapolate(x0, y0, x1, y1, x2, y2, x):
             value = val[0]
         elif extrap == 'linear':
             value = linear_extrapolate(nstval[0], val[0], nstval[1], val[1], value)
-            value = min(value, val[0])
+            #value = min(value, val[0])
         elif extrap == 'quadratic' and nreal >= 3:
             y_vals = np.unique(val)[:3]
             idxs = np.searchsorted(val,y_vals)
@@ -4132,7 +4146,7 @@ def quadratic_extrapolate(x0, y0, x1, y1, x2, y2, x):
             value = val[-1]
         elif extrap == 'linear':
             value = linear_extrapolate(nstval[-2], val[-2], nstval[-1], val[-1], value)
-            value = max(value, val[-1])
+            #value = max(value, val[-1])
         elif extrap == 'quadratic' and nreal >= 3:
             y_vals = np.unique(val)[-3:]
             idxs = np.searchsorted(val,y_vals)
@@ -4144,13 +4158,11 @@ def quadratic_extrapolate(x0, y0, x1, y1, x2, y2, x):
 
     else:
         rank = np.searchsorted(nstval, value) - 1
-        nstdiff = nstval[rank + 1] - nstval[rank]
-        diff = val[rank + 1] - val[rank]
-        if nstdiff <= 0.0 or diff <= 0.0:
-            value = val[rank]
-        else:
-            nstdist = value - nstval[rank]
-            value = val[rank] + (nstdist / nstdiff) * diff
+        # Get the bounding x and y values
+        x0, x1 = nstval[rank], nstval[rank + 1]
+        y0, y1 = val[rank], val[rank + 1]
+        # Perform linear interpolation
+        value = y0 + (y1 - y0) * (value - x0) / (x1 - x0)
 
     return value, ilim
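A sanity check of the rewritten `quadratic_extrapolate` coefficients: fitting three points on a known parabola should reproduce it exactly outside the fitted range. The helper is re-stated standalone here purely for illustration.

```python
# Standalone restatement of the nested helper above, checked against an
# arbitrary known parabola f(x) = 2x^2 - 3x + 1.
def quadratic_extrapolate(x1, y1, x2, y2, x3, y3, x4):
    y12, y23 = y1 - y2, y2 - y3
    x12, x23, x13 = x1 - x2, x2 - x3, x1 - x3
    a = (y12 * x23 - y23 * x12) / (x12 * x23 * x13)
    b = y23 / x23 - a * (x2 + x3)
    c = y1 - x1 * (a * x1 + b)
    return a * x4**2 + b * x4 + c

f = lambda x: 2.0 * x**2 - 3.0 * x + 1.0
assert abs(quadratic_extrapolate(0, f(0), 1, f(1), 2, f(2), 5.0) - f(5.0)) < 1e-9
```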