lasso-control-variates.py

# -*- coding: utf-8 -*-


import numpy as np
import scipy.stats as spstats
from scipy import signal
import pickle
from multiprocessing import Pool
import multiprocessing

class potentialRegression:
    """ Potential U of the posterior distribution of a Bayesian regression.

    U(theta) = - loglikelihood(theta) - logprior(theta), i.e. minus the
    logarithm of the unnormalised posterior density. The likelihood is
    selected by ``typ``:
     - "g": Linear (Gaussian noise of variance varY)
     - "l": Logistic
     - any other value: Probit
    The prior is a centred isotropic Gaussian of variance varTheta.
    """

    varY = 1 # Variance of the linear likelihood
    varTheta = 100 # Variance of the prior Gaussian distribution

    def __init__(self,Y,X,typ):
        """ initialisation
        Args:
            Y: observations, one entry per row of X
            X: covariates, array of shape (p, d)
            typ: type of the regression, "g" (Linear), "l" (Logistic),
                anything else selects Probit
        """
        self.Y = Y
        self.X = X
        self.type = typ
        # p: number of rows of X (observations), d: number of covariates
        self.p, self.d = X.shape

    def loglikelihood(self,theta):
        """ loglikelihood of the Bayesian regression
        Args:
            theta: parameter of the state space R^d where the likelihood is
                evaluated
        Returns:
            real value of the log-likelihood evaluated at theta
        """
        XTheta = np.dot(self.X, theta)
        if self.type == "g": # Linear regression
            # One Gaussian normalising factor per observation, hence p.
            return -(1. / (2*self.varY))* np.linalg.norm(self.Y-XTheta)**2 \
                        - (self.p/2.)*np.log(2*np.pi*self.varY)
        elif self.type == "l": # Logistic
            # log(1 + exp(x)) evaluated as logaddexp(0, x): no overflow for
            # large x, where the naive formula returns inf.
            return np.dot(self.Y, XTheta) - np.sum(np.logaddexp(0., XTheta))
        else: # Probit
            # logcdf stays finite in the tails, where log(cdf(x)) is -inf
            # and 0 * -inf would poison the sum with nan.
            temp1 = np.dot(self.Y, spstats.norm.logcdf(XTheta))
            temp2 = np.dot((1 - self.Y), spstats.norm.logcdf(-XTheta))
            return temp1+temp2

    def gradloglikelihood(self, theta):
        """ gradient of the loglikelihood of the Bayesian regression
        Args:
            theta: parameter of the state space R^d where the gradient of the
                likelihood is evaluated
        Returns:
            R^d vector of the gradient of the likelihood evaluated at theta
        """
        XTheta = np.dot(self.X, theta)
        if self.type == "g": # Linear
            # X^T (Y - X theta): two matrix-vector products instead of
            # building the d x d matrix X^T X on every call.
            return (1. / self.varY)*np.dot(np.transpose(self.X), self.Y - XTheta)
        elif self.type == "l": # Logistic
            # Sigmoid 1 / (1 + exp(-x)) computed through logaddexp so that
            # large |x| gives 0 or 1 instead of inf / inf = nan.
            sigmoid = np.exp(-np.logaddexp(0., -XTheta))
            return np.dot(np.transpose(self.X), self.Y - sigmoid)
        else: # Probit
            # log of the standard normal density at X theta
            logpdfXTheta = -0.5*(np.square(XTheta)+np.log(2*np.pi))
            # Ratios pdf / cdf formed in log-space to stay finite in the tails
            temp1 = np.multiply(self.Y, np.exp(logpdfXTheta \
                                               -spstats.norm.logcdf(XTheta)))
            temp2 = np.multiply((1 -self.Y), np.exp(logpdfXTheta \
                                               -spstats.norm.logcdf(-XTheta)))
            return np.dot(np.transpose(self.X), temp1-temp2)

    def logprior(self, theta):
        """ logarithm of the prior distribution, which is a Gaussian distribution
            of variance varTheta
        Args:
            theta: parameter of R^d where the log prior is evaluated
        Returns:
            real value of the log prior evaluated at theta
        """
        return -(1. / (2*self.varTheta))* np.linalg.norm(theta)**2  \
                - (self.d/2.)*np.log(2*np.pi*self.varTheta)

    def gradlogprior(self, theta):
        """ gradient of the logarithm of the prior distribution, which is
            a Gaussian distribution of variance varTheta
        Args:
            theta: parameter of R^d where the gradient log prior is evaluated
        Returns:
            R^d vector of the gradient of the log prior evaluated at theta
        """
        return -(1. / self.varTheta)*theta

    def potential(self, theta):
        """ potential U: minus the sum of the log-likelihood and the log prior
        Args:
            theta: parameter of R^d where the potential is evaluated
        Returns:
            real value of the potential evaluated at theta
        """
        return -self.loglikelihood(theta)-self.logprior(theta)

    def gradpotential(self, theta):
        """ gradient of the potential U
        Args:
            theta: parameter of R^d where the gradient is evaluated
        Returns:
            R^d vector of the gradient of the potential evaluated at theta
        """
        return -self.gradloglikelihood(theta)-self.gradlogprior(theta)

""" Samplers ULA, MALA, RWM """
    
def ULA(step, N, n):
    """ MCMC ULA (Unadjusted Langevin Algorithm)

    Uses the module-level objects ``potential`` (for the gradient of U)
    and ``d`` (dimension of the state space).
    Args:
        step: stepsize of the algorithm
        N: burn-in period
        n: number of samples after the burn-in
    Returns:
        traj: a numpy array of size (n, d), where the trajectory is stored
        traj_grad: numpy array of size (n, d), where the gradients of the
            potential U along the trajectory are stored
    """
    grad_U = potential.gradpotential
    noise_scale = np.sqrt(2*step)
    # initial value X_0
    x = np.random.normal(scale=5.0, size=d)
    # burn-in period: iterate the chain without recording anything
    for _ in np.arange(N):
        x = x - step * grad_U(x) + noise_scale*np.random.normal(size=d)
    traj = np.zeros((n, d))
    traj_grad = np.zeros((n, d))
    # samples: each row is filled in place before the chain moves on
    for state_row, grad_row in zip(traj, traj_grad):
        grad = grad_U(x)
        state_row[:] = x
        grad_row[:] = grad
        x = x - step * grad + noise_scale*np.random.normal(size=d)
    return (traj, traj_grad)

def MALA(step, N, n):
    """ MCMC MALA (Metropolis Adjusted Langevin Algorithm)

    Uses the module-level objects ``potential`` (for U and its gradient)
    and ``d`` (dimension of the state space).
    The potential and gradient at the current state are cached and only
    refreshed when a proposal is accepted, so each iteration costs one
    evaluation of U and one of its gradient (both at the proposal).
    Args:
        step: stepsize of the algorithm
        N: burn-in period
        n: number of samples after the burn-in
    Returns:
        traj: a numpy array of size (n, d), where the trajectory is stored
        traj_grad: numpy array of size (n, d), where the gradients of the
            potential U along the trajectory are stored
    """
    U = potential.potential
    grad_U = potential.gradpotential
    traj = np.zeros((n, d))
    traj_grad = np.zeros((n, d))
    noise_scale = np.sqrt(2*step)
    x = np.random.normal(scale=5.0, size=d)
    U_x = U(x)
    grad_x = grad_U(x)
    # The first N iterations are the burn-in; recording starts afterwards.
    for k in np.arange(N + n):
        if k >= N:
            traj[k - N,] = x
            traj_grad[k - N,] = grad_x
        # Langevin proposal
        y = x - step * grad_x + noise_scale*np.random.normal(size=d)
        U_y = U(y)
        grad_y = grad_U(y)
        # log of the Metropolis-Hastings ratio: target ratio plus the
        # correction for the asymmetric Gaussian proposal
        logratio = -U_y+U_x + (1./(4*step))*(np.linalg.norm(y-x+step*grad_x)**2 \
                      - np.linalg.norm(x-y+step*grad_y)**2)
        if np.log(np.random.uniform())<=logratio:
            x, U_x, grad_x = y, U_y, grad_y
    return (traj, traj_grad)

def RWM(step, N, n):
    """ MCMC RWM (Random Walk Metropolis)

    Uses the module-level objects ``potential`` (for U and its gradient)
    and ``d`` (dimension of the state space).
    Args:
        step: stepsize of the algorithm
        N: burn-in period
        n: number of samples after the burn-in
    Returns:
        traj: a numpy array of size (n, d), where the trajectory is stored
        traj_grad: numpy array of size (n, d), where the gradients of the
            potential U along the trajectory are stored
    """
    U = potential.potential
    grad_U = potential.gradpotential # for control variates only
    proposal_scale = np.sqrt(2*step)

    def metropolis_move(current):
        # Symmetric Gaussian proposal followed by the accept/reject test
        candidate = current + proposal_scale*np.random.normal(size=d)
        logratio = U(current) - U(candidate)
        if np.log(np.random.uniform()) <= logratio:
            return candidate
        return current

    x = np.random.normal(scale=5.0, size=d)
    # burn-in period
    for _ in np.arange(N):
        x = metropolis_move(x)
    traj = np.zeros((n, d))
    traj_grad = np.zeros((n, d))
    # samples: record the state and its gradient, then move
    for state_row, grad_row in zip(traj, traj_grad):
        state_row[:] = x
        grad_row[:] = grad_U(x)
        x = metropolis_move(x)
    return (traj, traj_grad)

""" Control Variates and estimators for mean, asymptotic variance """

def normalSamples(traj,traj_grad):
    """ Empirical means of theta and theta^2 (first and second order
        moments) along the trajectory, with the associated asymptotic
        variances obtained from a spectral variance estimator.
    Args:
        traj: numpy array (n, d) that contains the trajectory of the MCMC algorithm
        traj_grad: numpy array (n, d) that contains the gradients of the
            log posterior evaluated along the trajectory (not used here,
            kept so that all estimators share the same signature)
    Returns:
        mean_samples: numpy array of size 2*d, containing the means of theta
            and theta^2
        var_samples: numpy array of size 2*d, containing the asymptotic
            variances of theta and theta^2
    """
    n, d = traj.shape
    samples = np.hstack((traj, np.square(traj)))
    mean_samples = samples.mean(axis=0)
    centred = samples - mean_samples
    # Spectral variance estimator, Flegal and Jones, 2010:
    # truncation point and cosine lag window, shared by all columns
    bn = int(n**0.5)
    window = 0.5 + 0.5*np.cos((np.pi/bn)*np.arange(bn))
    var_samples = np.empty(2*d)
    for idx, column in enumerate(centred.T):
        # empirical autocovariances from the FFT-based auto-correlation
        autocov = (1./n)*signal.fftconvolve(column, column[::-1], mode="same")
        # keep the first half and flip it so that index 0 is lag 0
        autocov = autocov[n//2::-1]
        var_samples[idx] = 2*np.dot(window, autocov[:bn]) - autocov[0]
    return (mean_samples, var_samples)

def CVpolyOne(traj,traj_grad):
    """ Control variates estimator based on 1st order polynomials, CV1,
        of theta and theta^2 (first and second order moments) along the
        trajectory, with the associated asymptotic variances obtained
        from a spectral variance estimator.
    Args:
        traj: numpy array (n, d) that contains the trajectory of the MCMC algorithm
        traj_grad: numpy array (n, d) that contains the gradients of the
            log posterior evaluated along the trajectory
    Returns:
        mean_CV1: numpy array of size 2*d, containing CV1 applied with
            the test functions theta and theta^2
        var_CV1: numpy array of size 2*d, containing the asymptotic
            variances of CV1 applied with the test functions theta and theta^2
    """
    n, d = traj.shape
    samples = np.hstack((traj, np.square(traj)))
    # Empirical covariance between theta and the test functions gives the
    # coefficients of the control variate
    joint_cov = np.cov(np.hstack((traj, samples)), rowvar=False)
    paramCV1 = joint_cov[:d, d:]
    corrected = samples - traj_grad.dot(paramCV1)
    mean_CV1 = corrected.mean(axis=0)
    centred = corrected - mean_CV1
    # Spectral variance estimator: truncation point and cosine lag window
    bn = int(n**0.5)
    window = 0.5 + 0.5*np.cos((np.pi/bn)*np.arange(bn))
    var_CV1 = np.empty(2*d)
    for idx, column in enumerate(centred.T):
        autocov = (1./n)*signal.fftconvolve(column, column[::-1], mode="same")
        # keep the first half and flip it so that index 0 is lag 0
        autocov = autocov[n//2::-1]
        var_CV1[idx] = 2*np.dot(window, autocov[:bn]) - autocov[0]
    return (mean_CV1, var_CV1)

def CVpolyTwo(traj, traj_grad):
    """ Control variates estimator based on 2nd order polynomials, CV2,
        of theta and theta^2 (first and second order moments) along the
        trajectory, with the associated asymptotic variances obtained
        from a spectral variance estimator.
    Args:
        traj: numpy array (n, d) that contains the trajectory of the MCMC algorithm
        traj_grad: numpy array (n, d) that contains the gradients of the
            log posterior evaluated along the trajectory
    Returns:
        mean_CV2: numpy array of size 2*d, containing CV2 applied with
            the test functions theta and theta^2
        var_CV2: numpy array of size 2*d, containing the asymptotic
            variances of CV2 applied with the test functions theta and theta^2
    """
    n, d = traj.shape
    n_terms = int(d*(d+3)/2)
    samples = np.hstack((traj, np.square(traj)))
    # Column pairs (j, i) with j < i, in the order j outer / i inner
    low, high = np.triu_indices(d, k=1)

    # Polynomial basis: theta_i, theta_i^2, then the cross products
    poisson = np.zeros((n, n_terms))
    poisson[:, :d] = traj
    poisson[:, d:2*d] = traj*traj
    poisson[:, 2*d:] = traj[:, high]*traj[:, low]

    # The matching columns built from the gradients of the potential
    Lpoisson = np.zeros((n, n_terms))
    Lpoisson[:, :d] = -traj_grad
    Lpoisson[:, d:2*d] = 2*(1. - traj*traj_grad)
    Lpoisson[:, 2*d:] = -(traj_grad[:, high]*traj[:, low]) \
            - traj_grad[:, low]*traj[:, high]

    # Cross-covariance between the basis and -Lpoisson, then its inverse
    cov1 = np.cov(np.hstack((poisson, -Lpoisson)), rowvar=False)
    A = np.linalg.inv(cov1[:n_terms, n_terms:d*(d+3)])
    # Covariance between the basis and the test functions
    cov2 = np.cov(np.hstack((poisson, samples)), rowvar=False)
    B = cov2[:n_terms, n_terms:]
    paramCV2 = A.dot(B)
    corrected = samples + Lpoisson.dot(paramCV2)
    mean_CV2 = corrected.mean(axis=0)
    centred = corrected - mean_CV2
    # Spectral variance estimator: truncation point and cosine lag window
    bn = int(n**0.5)
    window = 0.5 + 0.5*np.cos((np.pi/bn)*np.arange(bn))
    var_CV2 = np.empty(2*d)
    for idx, column in enumerate(centred.T):
        autocov = (1./n)*signal.fftconvolve(column, column[::-1], mode="same")
        # keep the first half and flip it so that index 0 is lag 0
        autocov = autocov[n//2::-1]
        var_CV2[idx] = 2*np.dot(window, autocov[:bn]) - autocov[0]
    return (mean_CV2, var_CV2)

def ZVpolyOne(traj, traj_grad):
    """ Zero variance estimator based on 1st order polynomials, ZV1,
        of theta and theta^2 (first and second order moments) along the
        trajectory, with the associated asymptotic variances obtained
        from a spectral variance estimator.

        The coefficients are the least-squares regression coefficients of
        the test functions on the gradients. They come from a single joint
        covariance matrix, which also keeps the d == 1 case working
        (np.cov of a single column returns a 0-d array that cannot be
        inverted).
    Args:
        traj: numpy array (n, d) that contains the trajectory of the MCMC algorithm
        traj_grad: numpy array (n, d) that contains the gradients of the
            log posterior evaluated along the trajectory
    Returns:
        mean_ZV1: numpy array of size 2*d, containing ZV1 applied with
            the test functions theta and theta^2
        var_ZV1: numpy array of size 2*d, containing the asymptotic
            variances of ZV1 applied with the test functions theta and theta^2
    """
    n, d = traj.shape
    samples = np.concatenate((traj, np.square(traj)), axis=1)
    # Joint covariance of (gradients, test functions): the top-left block is
    # the covariance of the gradients, the top-right block their covariance
    # with the test functions.
    joint_cov = np.cov(np.concatenate((traj_grad, samples), axis=1), rowvar=False)
    grad_cov = joint_cov[:d, :d]
    cross_cov = joint_cov[:d, d:]
    # Solve the linear system instead of forming the explicit inverse
    paramZV1 = np.linalg.solve(grad_cov, cross_cov)
    ZV1 = samples - np.dot(traj_grad, paramZV1)
    mean_ZV1 = np.mean(ZV1, axis=0)
    ZV1 -= mean_ZV1
    # Spectral variance estimator: truncation point and cosine lag window
    # do not depend on the column, so they are computed once.
    bn = int(n**(1./2))
    wn = 1./2 + (1./2)*np.cos((np.pi/bn)*np.arange(bn))
    var_ZV1 = np.empty(2*d)
    for i in np.arange(2*d):
        tp1 = (1./n)*signal.fftconvolve(ZV1[:,i], ZV1[::-1,i], mode="same")
        # keep the first half and flip it so that index 0 is lag 0
        tp1 = tp1[:(int(n/2)+1)]
        tp1 = tp1[::-1]
        gam0 = tp1[0]
        var_ZV1[i]= -gam0+2*np.dot(wn, tp1[:bn])
    return (mean_ZV1, var_ZV1)

def ZVpolyTwo(traj, traj_grad):
    """ Zero variance estimator based on 2nd order polynomials, ZV2,
        of theta and theta^2 (first and second order moments) along the
        trajectory, with the associated asymptotic variances obtained
        from a spectral variance estimator.
    Args:
        traj: numpy array (n, d) that contains the trajectory of the MCMC algorithm
        traj_grad: numpy array (n, d) that contains the gradients of the
            log posterior evaluated along the trajectory
    Returns:
        mean_ZV2: numpy array of size 2*d, containing ZV2 applied with
            the test functions theta and theta^2
        var_ZV2: numpy array of size 2*d, containing the asymptotic
            variances of ZV2 applied with the test functions theta and theta^2
    """
    n, d = traj.shape
    n_terms = int(d*(d+3)/2)
    samples = np.hstack((traj, np.square(traj)))
    # Column pairs (j, i) with j < i, in the order j outer / i inner
    low, high = np.triu_indices(d, k=1)

    # Regressors: d first-order columns, d squared columns, then one
    # column per cross product
    Lpoisson = np.zeros((n, n_terms))
    Lpoisson[:, :d] = -traj_grad
    Lpoisson[:, d:2*d] = 2*(1. - traj*traj_grad)
    Lpoisson[:, 2*d:] = -(traj_grad[:, high]*traj[:, low]) \
            - traj_grad[:, low]*traj[:, high]

    # Inverse covariance of the regressors and their covariance with the
    # test functions
    A = np.linalg.inv(np.cov(Lpoisson, rowvar=False))
    cov2 = np.cov(np.hstack((Lpoisson, samples)), rowvar=False)
    B = cov2[:n_terms, n_terms:]
    paramZV2 = -A.dot(B)
    corrected = samples + Lpoisson.dot(paramZV2)
    mean_ZV2 = corrected.mean(axis=0)
    centred = corrected - mean_ZV2
    # Spectral variance estimator: truncation point and cosine lag window
    bn = int(n**0.5)
    window = 0.5 + 0.5*np.cos((np.pi/bn)*np.arange(bn))
    var_ZV2 = np.empty(2*d)
    for idx, column in enumerate(centred.T):
        autocov = (1./n)*signal.fftconvolve(column, column[::-1], mode="same")
        # keep the first half and flip it so that index 0 is lag 0
        autocov = autocov[n//2::-1]
        var_ZV2[idx] = 2*np.dot(window, autocov[:bn]) - autocov[0]
    return (mean_ZV2, var_ZV2)

# Logistic/Probit regression

# Data for the logistic regression
# Swiss dataset: the last column holds the observations, the other columns
# the covariates.
# Forward slashes are accepted on Windows as well as on POSIX systems,
# whereas a backslash separator only resolves on Windows.
data = np.loadtxt("data/swiss.txt")
Y = data[:,-1]
X = data[:,0:-1]
# Normalization of the covariates: centre each column and scale it by the
# inverse of its standard deviation
X = np.dot(X - np.mean(X, axis=0), np.diag(1./np.std(X, axis=0)))

# Potential of the logistic regression posterior and its dimension; both
# are read as module-level globals by the samplers.
potential = potentialRegression(Y, X, "l")
d = potential.d

# Data for the probit regression
# vaso dataset

#data = np.loadtxt("data\\vaso.txt")
#Y = data[:,-1]
#X = data[:,0:-1]
#X = np.dot(X - np.mean(X, axis=0), np.diag(1./np.std(X, axis=0)))
#X = np.insert(X, 0, 1, axis=1)
#
#potential = potentialRegression(Y, X, "p")
#d = potential.d

#-----------------

""" step size
10**(-2) ULA
5*10**(-2) MALA - 0.574 optimal scaling
5*10**(-2) RWM - optimal acceptance rate scaling 0.234
"""

# Run configuration shared by the samplers below
N = 100000  # Burn-in period (number of discarded iterations)
n = 1000000  # Number of samples kept after the burn-in
step = 10 ** -2  # Step size
nc = 100  # Number of independent MCMC trajectories

def func(intseed):
    """ generic function that runs one MCMC trajectory (ULA) and computes
    means and variances for the ordinary samples, CV1, CV2, ZV1 and ZV2.

    Args:
        intseed: seed given to numpy's global random generator, different
            for each independent MCMC trajectory (nc trajectories)
    Returns:
        numpy array of size (10, 2*d); rows 2*r and 2*r+1 hold the mean
        and the asymptotic variance of the r-th estimator, in the order
        ordinary samples, CV1, CV2, ZV1, ZV2
    """
    np.random.seed(intseed)
    traj, traj_grad = ULA(step, N, n)

    # Estimators in the order in which their results are stored
    estimators = (normalSamples, CVpolyOne, CVpolyTwo, ZVpolyOne, ZVpolyTwo)
    sauv = np.zeros((2*len(estimators), 2*d))
    for rank, estimator in enumerate(estimators):
        sauv[2*rank, :], sauv[2*rank + 1, :] = estimator(traj, traj_grad)

    return sauv

#inputs_seed = np.arange(nc) # input seeds
#
## number of cores exploited for the computation of the independent trajectories
## by default, all available cores on the machine
#nbcores = multiprocessing.cpu_count()
#
#if __name__ == '__main__':
#    trav = Pool(nbcores)
#    res = trav.map(func, inputs_seed)
#    
#    # Save the result
#    with open('log_ula_nc100_N5_n6.pkl', 'wb') as f:
#        pickle.dump(res, f)

#%% LASSO

# Single ULA trajectory (and the gradients along it) used by the LASSO study
traj, traj_grad = ULA(step=step, N=N, n=n)
        
from itertools import cycle
import matplotlib.pyplot as plt

from sklearn.linear_model import lasso_path, enet_path

# ZV2: regressors built from the trajectory and its gradients, with the
# first coordinate of the trajectory as response

n, d = traj.shape
n_terms = int(d*(d+3)/2)
#samples = np.concatenate((traj, np.square(traj)), axis=1)
Lpoisson = np.zeros((n, n_terms))
Lpoisson[:, :d] = -traj_grad
Lpoisson[:, d:2*d] = 2*(1. - traj*traj_grad)
# cross terms, one column per pair j < i
k = 2*d
for j in np.arange(d-1):
    for i in np.arange(j+1, d):
        Lpoisson[:, k] = -(traj_grad[:, i]*traj[:, j]) \
                - traj_grad[:, j]*traj[:, i]
        k += 1

X = Lpoisson
y = traj[:, 0]

# Standardize data (easier to set the l1_ratio parameter); done in place,
# so Lpoisson is rescaled as well
X /= X.std(axis=0)

# CV2: design matrix and response for the first coordinate of the trajectory

n, d = traj.shape
n_terms = int(d*(d+3)/2)
#samples = np.concatenate((traj, np.square(traj)), axis=1)
samples = traj[:, 0]

# Second-order polynomial basis and the matching gradient-based columns
poisson = np.zeros((n, n_terms))
poisson[:, :d] = traj
poisson[:, d:2*d] = traj*traj
Lpoisson = np.zeros((n, n_terms))
Lpoisson[:, :d] = -traj_grad
Lpoisson[:, d:2*d] = 2*(1. - traj*traj_grad)
# cross terms, one column per pair j < i, filled for both arrays at once
k = 2*d
for j in np.arange(d-1):
    for i in np.arange(j+1, d):
        poisson[:, k] = traj[:, i]*traj[:, j]
        Lpoisson[:, k] = -(traj_grad[:, i]*traj[:, j]) \
                - traj_grad[:, j]*traj[:, i]
        k += 1

# Cross-covariance between the basis and -Lpoisson plays the role of X^T X;
# the transpose of its Cholesky factor is used as design matrix
cov1 = np.cov(np.hstack((poisson, -Lpoisson)), rowvar=False)
XTX = cov1[:n_terms, n_terms:d*(d+3)]
X = np.linalg.cholesky(XTX).T


#A = np.linalg.inv(cov1[0:int(d*(d+3)/2), int(d*(d+3)/2):d*(d+3)])
# Covariance between the basis and the response
cov2 = np.cov(np.column_stack((poisson, samples)), rowvar=False)
B = cov2[:n_terms, n_terms:].flatten()

# Response y chosen so that X^T y = B
y = np.linalg.solve(X.T, B)

# Compute paths
# eps is passed by keyword in every call: the enet_path calls already did so,
# and the positional form is not accepted by every scikit-learn version.
# NOTE(review): fit_intercept is forwarded through the extra keyword
# arguments of lasso_path / enet_path; confirm that the installed
# scikit-learn version still accepts it.

eps = 5e-3  # the smaller it is the longer is the path

print("Computing regularization path using the lasso...")
alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps, fit_intercept=False)

print("Computing regularization path using the positive lasso...")
alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path(
    X, y, eps=eps, positive=True, fit_intercept=False)
print("Computing regularization path using the elastic net...")
alphas_enet, coefs_enet, _ = enet_path(
    X, y, eps=eps, l1_ratio=0.8, fit_intercept=False)

print("Computing regularization path using the positive elastic net...")
alphas_positive_enet, coefs_positive_enet, _ = enet_path(
    X, y, eps=eps, l1_ratio=0.8, positive=True, fit_intercept=False)

# Display results
# Three figures, each overlaying two regularization paths: one curve per
# coefficient, plotted against -log10(alpha). Solid lines show the first
# path of the pair, dashed lines the second.

# Figure 1: lasso (solid) against elastic net (dashed)
plt.figure(1)
ax = plt.gca()

# The colour iterator is shared by the three figures and is never reset,
# so each figure continues the cycle where the previous one stopped.
colors = cycle(['b', 'r', 'g', 'c', 'k'])
neg_log_alphas_lasso = -np.log10(alphas_lasso)
neg_log_alphas_enet = -np.log10(alphas_enet)
for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors):
    l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c)
    l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c)

plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Lasso and Elastic-Net Paths')
# The legend uses the lines drawn in the last loop iteration
plt.legend((l1[-1], l2[-1]), ('Lasso', 'Elastic-Net'), loc='lower left')
plt.axis('tight')


# Figure 2: lasso (solid) against positive lasso (dashed)
plt.figure(2)
ax = plt.gca()
neg_log_alphas_positive_lasso = -np.log10(alphas_positive_lasso)
for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors):
    l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c)
    l2 = plt.plot(neg_log_alphas_positive_lasso, coef_pl, linestyle='--', c=c)

plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Lasso and positive Lasso')
plt.legend((l1[-1], l2[-1]), ('Lasso', 'positive Lasso'), loc='lower left')
plt.axis('tight')


# Figure 3: elastic net (solid) against positive elastic net (dashed)
plt.figure(3)
ax = plt.gca()
neg_log_alphas_positive_enet = -np.log10(alphas_positive_enet)
for (coef_e, coef_pe, c) in zip(coefs_enet, coefs_positive_enet, colors):
    l1 = plt.plot(neg_log_alphas_enet, coef_e, c=c)
    l2 = plt.plot(neg_log_alphas_positive_enet, coef_pe, linestyle='--', c=c)

plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Elastic-Net and positive Elastic-Net')
plt.legend((l1[-1], l2[-1]), ('Elastic-Net', 'positive Elastic-Net'),
           loc='lower left')
plt.axis('tight')
# Show all three figures
plt.show()