-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGaussian_data.py
131 lines (93 loc) · 3.92 KB
/
Gaussian_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#Python code to generate Fig. 2, 3, 4, points. This uses a slightly different notation a,b for the coefficients alpha, r in the paper. alpha=b, r=(a-1)/2b
from math import*
import numpy as np
import pandas as pd
import argparse
import multiprocessing as mp
from sklearn import linear_model
# --- Command-line interface ---
parser = argparse.ArgumentParser(description='Job launcher')
parser.add_argument("-a", type=float)#a: decay exponent of the teacher mode powers p/(k+1)^a
parser.add_argument("-b", type=float)#b: decay exponent of the feature covariance spectrum (paper's alpha)
parser.add_argument("-c",type=float)#c: extra exponent, only used when -o decay
parser.add_argument("-s", type=float)#logsigma: base-10 log of the label-noise std
parser.add_argument("-p", type=int)#feature space dim
parser.add_argument("-v", type=int)#samples: row index into the replica table selecting the sample size
parser.add_argument("-o", type=str)#CV or noreg or decay  (NOTE(review): code below tests "no_reg", not "noreg" — confirm expected flag)
parser.add_argument("-l", type=str)#either "use_replica_lambda" or "sklearn"
args = parser.parse_args()
a=args.a
b=args.b
c=args.c
p=int(args.p)
# Power-law covariance spectrum: the k-th eigenvalue is p/(k+1)^b.
spec_Omega = np.array([p/(k+1)**b for k in range(p)])
# Student and teacher feature covariances share the same spectrum here.
Phi = spec_Omega
Psi = spec_Omega
sigma=10**(args.s)  # noise std, from its log10 given on the command line
# Teacher weights scaled so the k-th mode power Psi_k * teacher_k^2 equals p/(k+1)^a.
teacher = np.sqrt(np.array([p/(k+1)**a for k in range(p)]) / spec_Omega)
rho = np.mean(Psi * teacher**2)  # teacher signal strength (unused below)
diagUtPhiPhitU = Phi**2 * teacher**2  # diagonal overlap term (unused below)
# NOTE(review): this duplicates the args.o=="CV" test used again just below.
if args.o=="CV":
    CV=True
else:
    CV=False
# Load the replica-theory predictions matching the chosen mode; each CSV holds
# optimal lambdas per (a, b, sigma, samples).
if args.o=="CV":
    rep=pd.read_csv("./Data/Replica/Noisy_opt_lambda_ridge_decay_a{}_b{}.csv".format(a,b))
elif args.o=="no_reg":
    rep=pd.read_csv("./Data/Replica/Noisy_opt_lambda_ridge_decay_a{}_b{}_noreg.csv".format(a,b))
elif args.o=="decay":
    rep=pd.read_csv("./Data/Replica/Noisy_opt_lambda_ridge_decay_a{}_b{}_c{}.csv".format(a,b,c))
# NOTE(review): if -o is none of CV/no_reg/decay, `rep` is undefined and the
# next line raises NameError — confirm the launcher only ever passes these three.
# Keep rows whose sigma matches to within 1% and whose (a, b) match exactly.
rep=rep[(np.abs(rep["sigma"]/sigma-1)<0.01)& (rep["a"]==a) & (rep["b"]==b)]
# The -v flag indexes into the surviving rows to pick one sample size.
samples=int(round(np.array(rep["samples"])[int(args.v)]))
use_replica_=args.l
if use_replica_=="use_replica_lambda":
    use_replica_lambda=True
else:
    use_replica_lambda=False
# Replica-predicted regularisation for this sample size (mean over near-matches).
lamb=rep[(np.abs(rep["samples"]-samples)<1)]["lamb"].mean()
def get_samples(samples, teacher, seed):
    """Draw one synthetic dataset from the Gaussian teacher model.

    Relies on the module-level globals ``spec_Omega`` (covariance spectrum),
    ``p`` (feature dimension) and ``sigma`` (noise std). Reseeds NumPy's
    global RNG twice so datasets are reproducible per seed.

    Returns the (samples, p) design matrix scaled by 1/sqrt(p) and the
    (samples,) noisy labels.
    """
    np.random.seed(seed)
    design = np.random.normal(0, 1, (samples, p)) * np.sqrt(spec_Omega)
    np.random.seed(seed + 10)
    noise = sigma * np.random.normal(0, 1, (samples,))
    labels = design @ teacher / np.sqrt(p) + noise
    return design / np.sqrt(p), labels
def ridge_estimator(X, y, lamb=0.1):
    '''
    Ridge regression estimator w = (X^T X + lamb*I)^-1 X^T y.

    Picks whichever formulation yields the smaller linear system: the primal
    n x n system when m >= n, or the equivalent dual (kernel) m x m system
    when m < n. Uses np.linalg.solve instead of forming an explicit inverse,
    which is both faster and more numerically stable.

    Parameters
    ----------
    X : (m, n) ndarray, design matrix.
    y : (m,) ndarray, targets.
    lamb : float, ridge penalty (default 0.1).

    Returns
    -------
    (n,) ndarray of ridge weights.
    '''
    m, n = X.shape
    if m >= n:
        # Primal form: solve the n x n normal equations.
        return np.linalg.solve(X.T @ X + lamb * np.identity(n), X.T @ y)
    # Dual form: solve an m x m system, cheaper in the overparameterized regime.
    return X.T @ np.linalg.solve(X @ X.T + lamb * np.identity(m), y)
def get_errors(samples, teacher, seed):
    """Fit ridge on one random dataset and measure train/test errors.

    Draws independent train and test sets of the same size, fits either with
    the replica-predicted lambda (module global ``lamb``) or with sklearn's
    RidgeCV, and returns one result row for the output DataFrame.
    """
    # NOTE(review): test seed is 3*seed, so for seed=0 train and test
    # coincide — confirm this is intended for the seed range used.
    X_train, y_train = get_samples(samples, teacher, seed)
    X_test, y_test = get_samples(samples, teacher, 3 * seed)
    if use_replica_lambda or not CV:
        # Fixed regularisation from the replica prediction.
        w = ridge_estimator(X_train, y_train, lamb=lamb)
        yhat_train = X_train @ w
        yhat_test = X_test @ w
        lamb_opt = lamb
    else:
        # Cross-validated ridge; the alpha grid includes 0 (unregularised).
        grid = np.hstack((np.logspace(-5, 1.5, 100), np.array([0])))
        cv_model = linear_model.RidgeCV(alphas=grid)
        cv_model.fit(X_train, y_train)
        yhat_test = cv_model.predict(X_test)
        yhat_train = cv_model.predict(X_train)
        lamb_opt = cv_model.alpha_
    train_error = np.mean((y_train - yhat_train) ** 2)
    # Subtract the noise variance so this is the excess test error (error gap).
    test_error = np.mean((y_test - yhat_test) ** 2) - sigma ** 2
    return [train_error, test_error, a, b, c, lamb_opt, samples, samples / p, sigma, p, seed]
# Fan out 50 independent seeds across all cores; each worker returns one row.
pool = mp.Pool(mp.cpu_count())
results = pool.starmap_async(get_errors, [(samples, teacher, s) for s in range(0, 50)]).get()
pool.close()
pool.join()  # wait for workers to terminate before writing output
results = np.array(results)
Df = pd.DataFrame(data=results,
                  columns=["train_error", "test_error", "a", "b", "c", "lambda",
                           "samples", "sample_complexity", "sigma", "p", "seed"])
# Build the output path from the run parameters.
if args.o != "decay":
    if not use_replica_lambda:
        # BUG FIX: the original template had 4 placeholders for 5 arguments,
        # so sigma was written into the "samples" slot and args.o was dropped.
        filename = "./Data/Artificial/sklearn/Simus_a{}_b{}_sigma{}_samples{}_{}.csv".format(a, b, sigma, samples, args.o)
    else:
        filename = "./Data/Artificial/{}/Simus_a{}_b{}_sigma{}_samples{}_{}.csv".format(args.o, a, b, sigma, samples, args.o)
else:
    filename = "./Data/Artificial/{}/Simus_a{}_b{}_c{}_sigma{}_samples{}_{}.csv".format(args.o, a, b, c, sigma, samples, args.o)
Df.to_csv(filename)