Skip to content

Commit

Permalink
Merge pull request #32 from ml4sts/docfeatures
Browse files Browse the repository at this point in the history
Docfeatures
  • Loading branch information
brownsarahm authored Apr 19, 2024
2 parents c33d7aa + a8bbfea commit 36e5065
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 55 deletions.
6 changes: 3 additions & 3 deletions docs/_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ parse:
# HTML-specific settings
html:
favicon : "" # A path to a favicon image
use_edit_page_button : false # Whether to add an "edit this page" button to pages. If `true`, repository information in repository: must be filled in
use_repository_button : false # Whether to add a link to your repository button
use_edit_page_button : true # Whether to add an "edit this page" button to pages. If `true`, repository information in repository: must be filled in
use_repository_button : true # Whether to add a link to your repository button
use_issues_button : false # Whether to add an "open an issue" button
use_multitoc_numbering : true # Continuous numbering across parts/chapters
extra_navbar : Powered by <a href="https://jupyterbook.org">Jupyter Book</a> # Will be displayed underneath the left navbar.
Expand Down Expand Up @@ -78,7 +78,7 @@ launch_buttons:

repository:
url : https://github.com/ml4sts/ml-sim # The URL to your book's repository
path_to_book : "" # A path to your book's folder, relative to the repository root.
path_to_book : "docs" # A path to your book's folder, relative to the repository root.
branch : main # Which branch of the repository should be used when creating links

#######################################################################################
Expand Down
4 changes: 2 additions & 2 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@ parts:
- file: anomaly/simpsons_paradox
- file: anomaly/rate_sp
- file: anomaly/regression_sp
- file: anomaly/api.rst
- file: anomaly/api
# sections:
# - file: path/to/part1/chapter2/section1
- caption: Biases
chapters:
- file: bias/populations
- file: bias/demo_populations
- file: bias/api.rst
- file: bias/api
# sections:
# - file: path/to/part2/chapter2/section1
- caption: Basic Generators
Expand Down
2 changes: 2 additions & 0 deletions docs/anomaly/api.rst → docs/anomaly/api.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
API Reference
=============

```{eval-rst}
.. automodule:: mlsim.anomaly
:members:
```
3 changes: 3 additions & 0 deletions docs/bias/api.rst → docs/bias/api.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
API Reference
=============


```{eval-rst}
.. automodule:: mlsim.bias
:members:
```
10 changes: 9 additions & 1 deletion docs/bias/populations.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,16 @@ The default is not a completely iid and balanced population. All populations are
defined by the following variables: $A$, $Z$, $Y$, $X$. A population has a `sample` method
and attributes for each component sampler.

## Object Structure

```{eval-rst}
.. autoclass:: mlsim.bias.Population
```

This takes one sampler of each factor of the joint data distribution


## Sampling bias

Populations also have samplers that
insert sampling, rather than population level biases. This allows for the creation of a population with one set of biases and to use the same object to draw additional datasets that have additionally biased samples. For example you may wish to have training data and audit datasets that have different distributions to demonstrate the impact of a biased sampling at one of those times.
insert sampling, rather than population level biases. This allows for the creation of a population with one set of biases and to use the same object to draw additional datasets that have additionally biased samples. For example you may wish to have training data and audit datasets that have different distributions to demonstrate the impact of a biased sampling at one of those times.
115 changes: 75 additions & 40 deletions mlsim/bias/bias_components.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
from collections import namedtuple
from collections.abc import Iterable

DemParams = namedtuple('DemParams',['Pa','Pz_a'])
TargetParams = namedtuple('TargetParams',['Py_az'])
Expand Down Expand Up @@ -37,6 +38,8 @@ def __init__(self,rho_a=.5,rho_z=.5):
default is independent sampling of a and z
'''
Pa = [1-rho_a, rho_a]
self.A = [0, 1]

Pz = [1-rho_z, rho_z]
super().__init__((Pa,[Pz,Pz]))

Expand All @@ -56,7 +59,7 @@ def sample(self,N):
a tuple of length 2 with elements a and z as column np arrays each
of length N
'''
a = np.random.choice([0,1], p= self.params.Pa, size=N)
a = np.random.choice(self.A, p= self.params.Pa, size=N)
z = [np.random.choice([0,1], p= self.params.Pz_a[ai]) for ai in a]

return np.asarray(a).T,np.asarray(z).T
Expand Down Expand Up @@ -110,30 +113,40 @@ class DemographicCorrelated(Demographic):

def __init__(self,rho_a=.5,rho_z=[.5,.3]):
'''
P(A = 1) = rho_a
P(Z=1,A=0) = rho_z[0]
P(Z=1,A=1) = rho_z[1]
P(A = 1) = rho_a or P(A) = rho_a
P(Z=1|A=i) = rho_z[i]
Parameters
rho_a : scalar or vector of floats
probability of A = 1 or distribution of A
rho_z : vector of 2 or len(rho_a)
probability Z=1, for A = i
'''
if isinstance(rho_a, Iterable):
Pa = rho_a
self.A = list(range(len(rho_a)))
else:
Pa = [1-rho_a, rho_a]
self.A = [0, 1]

Pz_a = [[1-rho_zi, rho_zi] for rho_zi in rho_z]

'''
Pa = [1-rho_a, rho_a]
Pz_a = [[1-rho_z[0], rho_z[0]],
[1-rho_z[1], rho_z[1]]]
Sampler.__init__(self,(Pa,Pz_a))


class Target(Sampler):
'''
'''
ParamCreator = TargetParams
def __init__(self,beta=0.05):
def __init__(self,beta=0.05,N_a=2):
'''
P(Y=Z|A,Z ) = P(Y=Z) = 1-beta
make errors with prob beta
beta =0, makes Y =Z
'''
pyeqz = [1-beta,beta]
Py_az = [[pyeqz,pyeqz],[pyeqz,pyeqz]]
Py_az = [[pyeqz,pyeqz]]*N_a
super().__init__((Py_az,))


Expand All @@ -147,7 +160,6 @@ def sample(self,a,z):
beta : float
'''

y = [np.random.choice([zi,1-zi],p= self.params.Py_az[ai][zi])
for ai,zi in zip(a,z)]

Expand All @@ -157,15 +169,15 @@ def sample(self,a,z):
class TargetDisadvantagedError(Target):
'''
'''
def __init__(self,beta=.1):
def __init__(self,beta=.1,N_a=2):
'''
make errors with prob beta
make errors with prob beta (advantaged, A=(N_a-1))
P(Y=Z|A=1,Z ) = P(Y=Z|A=1) = 1-beta
P(Y=Z|A=0,Z ) = P(Y=Z|A=0) = 1
'''
pyeqz = [1-beta,beta]
Py_az = [[[1,0],[1,0]],[pyeqz,pyeqz]]
Py_az = [[pyeqz, pyeqz]]*(N_a-1) + [[1, 0], [1, 0]]
Sampler.__init__(self,(Py_az,))

class TargetTwoError(Target):
Expand All @@ -183,6 +195,23 @@ def __init__(self,beta=[0,.1]):
Py_az = [[pyz_a0,pyz_a0],[pyz_a1,pyz_a1]]
Sampler.__init__(self,(Py_az,))


class TargetAllAError(Target):
'''
'''

def __init__(self, beta=[0, .1]):
'''
make errors with prob beta
P(Y=Z|A=1,Z ) = P(Y=Z|A=1) = 1-beta1
P(Y=Z|A=0,Z ) = P(Y=Z|A=0) = 1-beta0
# '''
# pyz_a0 = [1-beta[0], beta[0]]
# pyz_a1 = [1-beta[1], beta[1]]
Py_az = [[1-betaai, betaai]*2 for betaai in beta]
Sampler.__init__(self, (Py_az,))

class TargetFlipNegative(Target):
'''
'''
Expand All @@ -195,10 +224,10 @@ def __init__(self,beta=[0,.1]):
P(Y=Z|Z =0) = 1
'''
pyz1_a0 = [1-beta[0],beta[0]]
pyz1_a1 = [1-beta[1],beta[1]]
# pyz1_a0 = [1-beta[0],beta[0]]
# pyz1_a1 = [1-beta[1],beta[1]]
no_error = [1,0] # if z=0, P(Y=z) =1
Py_az = [[no_error,pyz1_a0],[no_error,pyz1_a1]]
Py_az = [[no_error, [1-betaai, betaai]] for betaai in beta]
Sampler.__init__(self,(Py_az,))

class TargetFlipAllIndep(Target):
Expand All @@ -214,9 +243,6 @@ def __init__(self,beta=[[.05,.1],[.05,.1]]):
'''
pyz1_a0 = [1-beta[0],beta[0]]
pyz1_a1 = [1-beta[1],beta[1]]
no_error = [1,0] # if z=0, P(Y=z) =1
Py_az = [[[1-b,b] for b in be] for be in beta]
Sampler.__init__(self,(Py_az,))

Expand All @@ -239,7 +265,7 @@ class Feature(Sampler):
'''
ParamCreator = FeatureParams
def __init__(self,dist= mean_only_mvn,mu = [[5,2],[2,5]],
param_tuple = None):
param_tuple = None,N_a =2):
'''
Parameters
----------
Expand All @@ -255,7 +281,8 @@ def __init__(self,dist= mean_only_mvn,mu = [[5,2],[2,5]],
super().__init__(param_tuple)
else:
# default params passed
theta = [[mu,mu],[mu, mu]]
# mu has diffs for Z=0,1; repeat for all A for all Y
theta = [[mu]*N_a]*2
super().__init__((dist,theta))

def sample(self,a,z,y):
Expand All @@ -281,19 +308,23 @@ def sample(self,a,z,y):

if type(self.params.theta[0][0][0]) == tuple:
# if a tuple, then expand and pass 2 params

x = [self.params.distfunc(*self.params.theta[yi][ai][zi])
for ai,zi,yi in zip(z,a,y)]
for ai,zi,yi in zip(a,z,y)]
else:
x = [self.params.distfunc(self.params.theta[yi][ai][zi])
for ai,zi,yi in zip(a,z,y)]
return np.asarray(x)

mvn = lambda mu,var :np.random.multivariate_normal(mu,var*np.eye(len(mu)))

class FeatureSharedParam(Feature):
'''
feature sampler with one parameter shared across Z (eg shared spread)
A and Y have no impact on X
'''
def __init__(self,dist,loc,spread):

def __init__(self, loc, spread, dist=mvn,N_a=2):
'''
unique locations and shared spread for no impact of A or Y
Expand All @@ -309,14 +340,15 @@ def __init__(self,dist,loc,spread):
'''

theta_z = [(li,spread) for li in loc]
theta = [[theta_z,theta_z],[theta_z, theta_z]]
theta = [[theta_z]*N_a]*2
super().__init__(param_tuple=(dist,theta))

class FeatureTwoParams(Feature):
'''
feature sampler with two unique parameters per class
'''
def __init__(self,dist,loc,spread):

def __init__(self, loc, spread, dist=mvn,N_a=2):
'''
unique locations and shared spread for z, no impact of a an y
Expand All @@ -331,8 +363,8 @@ def __init__(self,dist,loc,spread):
spread parameter of dist, one per value of z
'''

theta_z = [(li,si) for li,si in zip(loc,spread)]
theta = [[theta_z,theta_z],[theta_z, theta_z]]
theta_z = [(li, si) for li, si in zip(loc, spread)]
theta = [[theta_z]*N_a]*2
super().__init__(param_tuple=(dist,theta))

class FeaturePerGroupTwoParam(Feature):
Expand All @@ -348,13 +380,17 @@ def __init__(self,dist,loc,spread):
dist : function handle
function to sample X|parameters where the parameters are dependent on
Z,A,Y
loc : list-like length |Z| of lists length 2
loc : list-like length |Z| of lists length |A|
location parameter of dist, one per value of z,a
spread : list-like length |Z| of lists length 2
spread : list-like length |Z| of lists length |A|
spread parameter of dist, one per value of z,a
'''
# '''
# print(len(loc), len(spread))
# print(len(loc[0]), len(spread[0]))
theta_za = [[(lii,sii) for lii,sii in zip(li,si)] for li,si in zip(loc,spread)]
# repeat so that features do not vary with Y
theta = [theta_za,theta_za]
# print(theta)
super().__init__(param_tuple=(dist,theta))

class FeaturePerGroupSharedParamWithinGroup(Feature):
Expand Down Expand Up @@ -416,9 +452,9 @@ def __init__(self,dist,loc,spread):
Parameters
----------
loc : list-like
one location parameter value per true value, protected attribute pair
one location parameter value per (true value, protected attribute) pair
spread : list-like
one spread parameter value per proxy value, protected attribute pair
one spread parameter value per (proxy value, protected attribute) pair
'''
theta_yaz = [[[(lii,sii) for lii,sii in zip(li,si)]
for li in loc] for si in spread]
Expand All @@ -434,16 +470,15 @@ class FeatureNoise(Sampler):
'''
ParamCreator = NoiseParams

def __init__(self,dist= shape_spread_only_mvn,sig = 1.0):
def __init__(self, dist=shape_spread_only_mvn, sig=1.0, N_a=2):
'''
'''
if type(sig) ==float:
# constant noise
theta = [[[sig,sig],[sig,sig]],[[sig,sig],[sig,sig]]]
elif len(sig) ==2:
theta = [[[sig,sig]]*N_a]*2
else:
# diff noise for protected attributes
theta = [[[sig[0],sig[0]],[sig[1],sig[1]]],
[[sig[0],sig[0]],[sig[1],sig[1]]]]
theta = [[sigi,sigi] for sigi in sig]*2

super().__init__((dist,theta))

Expand Down Expand Up @@ -538,7 +573,7 @@ def sample(a,z,y,x,dist,theta):
# functions for combining noise and true vectors
x_a = {0: lambda x,n: np.concatenate((x[:d_noise],n)),
1: lambda x,n: np.concatenate((n, x[d_shared-1:d], x[:d_noise]))}
x = [x_a[a](x_zi,x_ni) for a,x_zi,x_ni in zip(a,x_z,x_n)]
x = [x_a[a](x_zi,x_ni) for a,x_zi,x_ni in zip(a,x,x_n)]
x = np.asarray(x)

return x
Expand All @@ -551,7 +586,7 @@ def sample(a,z,y,x,dist,theta):



def feature_proxy(a,z,y,dist,theta):
def feature_proxy(a,z,y,distfunc,theta):
'''
some features are related to the ground truth and some are realated to the
proxy,
Expand Down
6 changes: 3 additions & 3 deletions mlsim/bias/populations.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
import pandas as pd
import aif360.datasets
from aif360.datasets import StructuredDataset
from .bias_components import Demographic, Target, Feature, FeatureNoise

default_params = {'dem':None,}
Expand Down Expand Up @@ -142,11 +142,11 @@ def make_StructuredDataset(self,a,z,y,x):
Returns
--------
aif360.datasets.StructuredDataset
aif360.datasets.StructuredDataset containing the data with y as the target and a as protected attribute.
'''
df = self.make_DataFrame(a,z,y,x)
return aif360.datasets.StructuredDataset(df, ['y'], ['a'])
return StructuredDataset(df, ['y'], ['a'])

def get_parameter_description(self):
'''
Expand Down
Loading

0 comments on commit 36e5065

Please sign in to comment.