From d84962d9a62ae3772efc3f44de671fd91e28ebb1 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Fri, 10 Feb 2023 16:37:29 -0500
Subject: [PATCH 1/5] fix doc build

---
 docs/bias/populations.md      | 10 +++++++++-
 mlsim/bias/bias_components.py |  4 ++--
 mlsim/bias/populations.py     |  6 +++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/docs/bias/populations.md b/docs/bias/populations.md
index cd5771b..90181c3 100644
--- a/docs/bias/populations.md
+++ b/docs/bias/populations.md
@@ -12,8 +12,16 @@ The default is not a completely iid and balanced population.
 All populations are defined by the following variables: $A$, $Z$, $Y$, $X$.
 A population has a `sample` method and attributes for each component sampler.
 
+## Object Structure
+
+```{eval-rst}
+.. autoclass:: mlsim.bias.Population
+```
+
+This takes one sampler for each factor of the joint data distribution.
+
 ## Sampling bias
 
 Populations also have samplers that
-insert sampling, rather than population level biases. This allows for the creation of a population with one set of biases and to use the same object to draw additional datasets that have additionally biased sampls. For example you may wish to have training data and audit datasets that have different disributions to demonstrate the impact of a biased sampling at one of those times.
+insert sampling, rather than population level, biases. This allows a population to be created with one set of biases and the same object to be used to draw additional datasets with additionally biased samples. For example, you may wish to have training data and audit datasets with different distributions to demonstrate the impact of biased sampling at one of those times.
diff --git a/mlsim/bias/bias_components.py b/mlsim/bias/bias_components.py
index 59f0506..4b50087 100644
--- a/mlsim/bias/bias_components.py
+++ b/mlsim/bias/bias_components.py
@@ -416,9 +416,9 @@ def __init__(self,dist,loc,spread):
         Parameters
         ----------
         loc : list-like
-            one location parameter value per true value, protected attribute pair
+            one location parameter value per (true value, protected attribute) pair
         spread : list-like
-            one spread parameter value per proxy value, protected attribute pair
+            one spread parameter value per (proxy value, protected attribute) pair
         '''
         theta_yaz = [[[(lii,sii) for lii,sii in zip(li,si)] for li in loc]
                      for si in spread]
diff --git a/mlsim/bias/populations.py b/mlsim/bias/populations.py
index f2a7cb3..ae9bd16 100644
--- a/mlsim/bias/populations.py
+++ b/mlsim/bias/populations.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-import aif360.datasets
+from aif360.datasets import StructuredDataset
 
 from .bias_components import Demographic, Target, Feature, FeatureNoise
 
 default_params = {'dem':None,}
@@ -142,11 +142,11 @@ def make_StructuredDataset(self,a,z,y,x):
 
         Returns
         --------
-        aif360.datasets.StructuredDataset
+        aif360.datasets.StructuredDataset containing the data with y as the
            target and a as protected attribute.
        '''
        df = self.make_DataFrame(a,z,y,x)
-       return aif360.datasets.StructuredDataset(df, ['y'], ['a'])
+       return StructuredDataset(df, ['y'], ['a'])

    def get_parameter_description(self):
        '''
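
A minimal usage sketch of the `Population` interface documented in the patch above (not itself part of any patch): it assumes the default constructor works as-is and that `sample(N)` returns the four components in the order a, z, y, x; only `sample`, `make_DataFrame`, and `make_StructuredDataset` are taken from the code shown.

```python
# Sketch only: assumed defaults and return order noted above.
from mlsim.bias import Population

pop = Population()                            # default component samplers
a, z, y, x = pop.sample(1000)                 # assumed return order: a, z, y, x
df = pop.make_DataFrame(a, z, y, x)           # pandas DataFrame of the sample
sd = pop.make_StructuredDataset(a, z, y, x)   # aif360 StructuredDataset (this patch)
```
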
From 827d1a482c6bbd5821e60f6a7bb8751147442197 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Fri, 10 Feb 2023 16:51:25 -0500
Subject: [PATCH 2/5] links

---
 docs/_config.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/_config.yml b/docs/_config.yml
index e692473..c6b6b97 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -48,8 +48,8 @@ parse:
 # HTML-specific settings
 html:
   favicon                   : ""     # A path to a favicon image
-  use_edit_page_button      : false  # Whether to add an "edit this page" button to pages. If `true`, repository information in repository: must be filled in
-  use_repository_button     : false  # Whether to add a link to your repository button
+  use_edit_page_button      : true   # Whether to add an "edit this page" button to pages. If `true`, repository information in repository: must be filled in
+  use_repository_button     : true   # Whether to add a link to your repository button
   use_issues_button         : false  # Whether to add an "open an issue" button
   use_multitoc_numbering    : true   # Continuous numbering across parts/chapters
   extra_navbar              : Powered by Jupyter Book  # Will be displayed underneath the left navbar.
@@ -78,7 +78,7 @@ launch_buttons:
 
 repository:
   url           : https://github.com/ml4sts/ml-sim  # The URL to your book's repository
-  path_to_book  : ""      # A path to your book's folder, relative to the repository root.
+  path_to_book  : "docs"  # A path to your book's folder, relative to the repository root.
   branch        : main    # Which branch of the repository should be used when creating links
 
 #######################################################################################

From 1800c69958697983c6213e7c4261fd0174801407 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Tue, 9 May 2023 10:38:39 -0400
Subject: [PATCH 3/5] allow |A|>2 and some fixes

---
 mlsim/bias/bias_components.py | 111 ++++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 38 deletions(-)

diff --git a/mlsim/bias/bias_components.py b/mlsim/bias/bias_components.py
index 4b50087..1b58243 100644
--- a/mlsim/bias/bias_components.py
+++ b/mlsim/bias/bias_components.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 from collections import namedtuple
+from collections.abc import Iterable
 
 DemParams = namedtuple('DemParams',['Pa','Pz_a'])
 TargetParams = namedtuple('TargetParams',['Py_az'])
@@ -37,6 +38,8 @@ def __init__(self,rho_a=.5,rho_z=.5):
         default is independent sampling of a and z
         '''
         Pa = [1-rho_a, rho_a]
+        self.A = [0, 1]
+
         Pz = [1-rho_z, rho_z]
 
         super().__init__((Pa,[Pz,Pz]))
@@ -56,7 +59,7 @@ def sample(self,N):
             a tuple of lenght 2 with elements a and z as column np arrays
             each of length N
         '''
-        a = np.random.choice([0,1], p= self.params.Pa, size=N)
+        a = np.random.choice(self.A, p= self.params.Pa, size=N)
         z = [np.random.choice([0,1], p= self.params.Pz_a[ai]) for ai in a]
 
         return np.asarray(a).T,np.asarray(z).T
@@ -110,14 +113,24 @@ class DemographicCorrelated(Demographic):
 
     def __init__(self,rho_a=.5,rho_z=[.5,.3]):
         '''
-        P(A = 1) = rho_a
-        P(Z=1,A=0) = rho_z[0]
-        P(Z=1,A=1) = rho_z[1]
+        P(A = 1) = rho_a or P(A) = rho_a
+        P(Z=1|A=i) = rho_z[i]
+
+        Parameters
+        ----------
+        rho_a : scalar or vector of floats
+            probability of A = 1, or the distribution of A
+        rho_z : vector of length 2 or len(rho_a)
+            probability that Z = 1 for A = i
+        '''
+        if isinstance(rho_a, Iterable):
+            Pa = rho_a
+            self.A = list(range(len(rho_a)))
+        else:
+            Pa = [1-rho_a, rho_a]
+            self.A = [0, 1]
+
+        Pz_a = [[1-rho_zi, rho_zi] for rho_zi in rho_z]
-        '''
-        Pa = [1-rho_a, rho_a]
-        Pz_a = [[1-rho_z[0], rho_z[0]],
-                [1-rho_z[1], rho_z[1]]]
 
         Sampler.__init__(self,(Pa,Pz_a))
@@ -125,7 +138,7 @@ class Target(Sampler):
     '''
     '''
     ParamCreator = TargetParams
-    def __init__(self,beta=0.05):
+    def __init__(self,beta=0.05,N_a=2):
         '''
         P(Y=Z|A,Z ) = P(Y=Z) = 1-beta
         make errors with prob beta
@@ -133,7 +146,7 @@ def __init__(self,beta=0.05):
         beta =0, makes Y =Z
         '''
         pyeqz = [1-beta,beta]
-        Py_az = [[pyeqz,pyeqz],[pyeqz,pyeqz]]
+        Py_az = [[pyeqz,pyeqz]]*N_a
 
         super().__init__((Py_az,))
@@ -147,7 +160,6 @@ def sample(self,a,z):
         beta : float
 
         '''
-
         y = [np.random.choice([zi,1-zi],p= self.params.Py_az[ai][zi])
                 for ai,zi in zip(a,z)]
 
@@ -157,15 +169,15 @@ class TargetDisadvantagedError(Target):
     '''
     '''
-    def __init__(self,beta=.1):
+    def __init__(self,beta=.1,N_a=2):
         '''
-        make errors with prob beta
+        make errors with prob beta, except for the advantaged group, A=(N_a-1)
         P(Y=Z|A=1,Z ) = P(Y=Z|A=1) = 1-beta
         P(Y=Z|A=0,Z ) = P(Y=Z|A=0) = 1
 
         '''
         pyeqz = [1-beta,beta]
-        Py_az = [[[1,0],[1,0]],[pyeqz,pyeqz]]
+        Py_az = [[pyeqz, pyeqz]]*(N_a-1) + [[[1, 0], [1, 0]]]
         Sampler.__init__(self,(Py_az,))
 
 class TargetTwoError(Target):
@@ -183,6 +195,23 @@ def __init__(self,beta=[0,.1]):
         pyz_a0 = [1-beta[0],beta[0]]
         pyz_a1 = [1-beta[1],beta[1]]
         Py_az = [[pyz_a0,pyz_a0],[pyz_a1,pyz_a1]]
         Sampler.__init__(self,(Py_az,))
+
+class TargetAllAError(Target):
+    '''
+    '''
+
+    def __init__(self, beta=[0, .1]):
+        '''
+        make errors with prob beta
+        P(Y=Z|A=1,Z ) = P(Y=Z|A=1) = 1-beta1
+        P(Y=Z|A=0,Z ) = P(Y=Z|A=0) = 1-beta0
+
+        '''
+        # pyz_a0 = [1-beta[0], beta[0]]
+        # pyz_a1 = [1-beta[1], beta[1]]
+        Py_az = [[[1-betaai, betaai]]*2 for betaai in beta]
+        Sampler.__init__(self, (Py_az,))
+
 class TargetFlipNegative(Target):
     '''
     '''
@@ -195,10 +224,10 @@ def __init__(self,beta=[0,.1]):
         P(Y=Z|Z =0) = 1
 
         '''
-        pyz1_a0 = [1-beta[0],beta[0]]
-        pyz1_a1 = [1-beta[1],beta[1]]
-        no_error = [1,0] # if z=0, P(Y=z) =1
-        Py_az = [[no_error,pyz1_a0],[no_error,pyz1_a1]]
+        # pyz1_a0 = [1-beta[0],beta[0]]
+        # pyz1_a1 = [1-beta[1],beta[1]]
+        no_error = [1,0] # if z=0, P(Y=z) =1
+        Py_az = [[no_error, [1-betaai, betaai]] for betaai in beta]
         Sampler.__init__(self,(Py_az,))
 
 class TargetFlipAllIndep(Target):
@@ -214,9 +243,6 @@ def __init__(self,beta=[[.05,.1],[.05,.1]]):
 
         '''
-        pyz1_a0 = [1-beta[0],beta[0]]
-        pyz1_a1 = [1-beta[1],beta[1]]
-        no_error = [1,0] # if z=0, P(Y=z) =1
         Py_az = [[[1-b,b] for b in be] for be in beta]
         Sampler.__init__(self,(Py_az,))
@@ -239,7 +265,7 @@ class Feature(Sampler):
     '''
     ParamCreator = FeatureParams
     def __init__(self,dist= mean_only_mvn,mu = [[5,2],[2,5]],
-                 param_tuple = None):
+                 param_tuple = None,N_a =2):
         '''
         Parameters
         ----------
@@ -255,7 +281,8 @@ def __init__(self,dist= mean_only_mvn,mu = [[5,2],[2,5]],
             super().__init__(param_tuple)
         else:
             # default params passed
-            theta = [[mu,mu],[mu, mu]]
+            # mu has diffs for Z=0,1; repeat for all A and all Y
+            theta = [[mu]*N_a]*2
             super().__init__((dist,theta))
 
     def sample(self,a,z,y):
@@ -281,19 +308,23 @@ def sample(self,a,z,y):
 
         if type(self.params.theta[0][0][0]) == tuple:
             # if a tuple, then expand and pass 2 params
+
             x = [self.params.distfunc(*self.params.theta[yi][ai][zi])
-                    for ai,zi,yi in zip(z,a,y)]
+                    for ai,zi,yi in zip(a,z,y)]
         else:
             x = [self.params.distfunc(self.params.theta[yi][ai][zi])
                     for ai,zi,yi in zip(a,z,y)]
 
         return np.asarray(x)
 
+mvn = lambda mu,var :np.random.multivariate_normal(mu,var*np.eye(len(mu)))
+
 class FeatureSharedParam(Feature):
     '''
     feature sampler with one parameter shared across Z (eg shared spread)
     A and Y have no impact on X
     '''
-    def __init__(self,dist,loc,spread):
+
+    def __init__(self, loc, spread, dist=mvn, N_a=2):
         '''
         unique locations and shared spread for no impact of A or Y
 
         Parameters
         ----------
         dist : function handle
             function to sample X|parameters where the paramters are dependend
             on Z but not A or Y
         loc : list-like length |Z|
             location parameter of dist, one per value of z
         spread : scalar
             spread parameter of dist
 
         '''
         theta_z = [(li,spread) for li in loc]
-        theta = [[theta_z,theta_z],[theta_z, theta_z]]
+        theta = [[theta_z]*N_a]*2
 
         super().__init__(param_tuple=(dist,theta))
 
 class FeatureTwoParams(Feature):
     '''
     feature sampler with two unique parameters per class
     '''
-    def __init__(self,dist,loc,spread):
+
+    def __init__(self, loc, spread, dist=mvn, N_a=2):
         '''
         unique locations and shared spread for z, no impact of a an y
 
         Parameters
         ----------
         dist : function handle
             function to sample X|parameters where the paramters are dependend
             on Z, but not A or Y
         loc : list-like length |Z|
             location parameter of dist, one per value of z
         spread : list-like length |Z|
             spread parameter of dist, one per value of z
 
         '''
-        theta_z = [(li,si) for li,si in zip(loc,spread)]
-        theta = [[theta_z,theta_z],[theta_z, theta_z]]
+        theta_z = [(li, si) for li, si in zip(loc, spread)]
+        theta = [[theta_z]*N_a]*2
 
         super().__init__(param_tuple=(dist,theta))
 
 class FeaturePerGroupTwoParam(Feature):
@@ -348,13 +380,17 @@ def __init__(self,dist,loc,spread):
         dist : function handle
             function to sample X|parameters where the paramters are dependend
             on Z,A,Y
-        loc : list-like length |Z| of lists length 2
+        loc : list-like length |Z| of lists length |A|
             location parameter of dist, one per value of z,a
-        spread : list-like length |Z| of lists length 2
+        spread : list-like length |Z| of lists length |A|
             spread parameter of dist, one per value of z,a
         '''
+        # print(len(loc), len(spread))
+        # print(len(loc[0]), len(spread[0]))
         theta_za = [[(lii,sii) for lii,sii in zip(li,si)] for li,si in zip(loc,spread)]
+        # repeat so that features do not vary with Y
         theta = [theta_za,theta_za]
+        # print(theta)
 
         super().__init__(param_tuple=(dist,theta))
 
 class FeaturePerGroupSharedParamWithinGroup(Feature):
@@ -434,16 +470,15 @@ class FeatureNoise(Sampler):
     '''
     '''
     ParamCreator = NoiseParams
-    def __init__(self,dist= shape_spread_only_mvn,sig = 1.0):
+    def __init__(self, dist=shape_spread_only_mvn, sig=1.0, N_a=2):
         '''
         '''
         if type(sig) ==float:
             # constant noise
-            theta = [[[sig,sig],[sig,sig]],[[sig,sig],[sig,sig]]]
-        elif len(sig) ==2:
+            theta = [[[sig,sig]]*N_a]*2
+        else:
             # diff noise for protected attributes
-            theta = [[[sig[0],sig[0]],[sig[1],sig[1]]],
-                    [[sig[0],sig[0]],[sig[1],sig[1]]]]
+            theta = [[[sigi,sigi] for sigi in sig]]*2
 
         super().__init__((dist,theta))
@@ -538,7 +573,7 @@ def sample(a,z,y,x,dist,theta):
     # functions for combining noise and true vectors
     x_a = {0: lambda x,n: np.concatenate((x[:d_noise],n)),
           1: lambda x,n: np.concatenate((n, x[d_shared-1:d], x[:d_noise]))}
-    x = [x_a[a](x_zi,x_ni) for a,x_zi,x_ni in zip(a,x_z,x_n)]
+    x = [x_a[a](x_zi,x_ni) for a,x_zi,x_ni in zip(a,x,x_n)]
     x = np.asarray(x)
 
     return x
@@ -551,7 +586,7 @@ def sample(a,z,y,x,dist,theta):
 
 
 
-def feature_proxy(a,z,y,dist,theta):
+def feature_proxy(a,z,y,distfunc,theta):
     '''
     some features are related to the ground truth and some are
     realated to the proxy,
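
A rough sketch of how the |A|>2 samplers changed in PATCH 3/5 fit together (not part of the patch series): the parameter values are illustrative only, and the import path assumes the module layout shown in the diffs.

```python
# Illustrative sketch of the generalized samplers from PATCH 3/5.
from mlsim.bias.bias_components import DemographicCorrelated, Target

# Three groups: P(A=i) comes from rho_a, P(Z=1|A=i) from rho_z[i].
dem = DemographicCorrelated(rho_a=[0.5, 0.3, 0.2], rho_z=[0.5, 0.4, 0.2])
a, z = dem.sample(500)

# Proxy target with error rate beta, replicated across the same three groups.
target = Target(beta=0.05, N_a=3)
y = target.sample(a, z)
```
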
From fb3b0ec036d9e9ea08a42dd78433a59f47a5acf0 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Fri, 19 Apr 2024 16:04:52 -0400
Subject: [PATCH 4/5] bump reqs to latest/any; fix docs

---
 docs/_toc.yml                    |  4 ++--
 docs/anomaly/{api.rst => api.md} |  2 ++
 docs/bias/{api.rst => api.md}    |  3 +++
 requirements.txt                 | 12 ++++++------
 4 files changed, 13 insertions(+), 8 deletions(-)
 rename docs/anomaly/{api.rst => api.md} (81%)
 rename docs/bias/{api.rst => api.md} (79%)

diff --git a/docs/_toc.yml b/docs/_toc.yml
index eb71690..062ce8d 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -6,14 +6,14 @@ parts:
     - file: anomaly/simpsons_paradox
     - file: anomaly/rate_sp
     - file: anomaly/regression_sp
-    - file: anomaly/api.rst
+    - file: anomaly/api
     # sections:
     #   - file: path/to/part1/chapter2/section1
   - caption: Biases
     chapters:
     - file: bias/populations
     - file: bias/demo_populations
-    - file: bias/api.rst
+    - file: bias/api
     # sections:
     #   - file: path/to/part2/chapter2/section1
   - caption: Basic Generators
diff --git a/docs/anomaly/api.rst b/docs/anomaly/api.md
similarity index 81%
rename from docs/anomaly/api.rst
rename to docs/anomaly/api.md
index b2fea45..bc2c80c 100644
--- a/docs/anomaly/api.rst
+++ b/docs/anomaly/api.md
@@ -1,5 +1,7 @@
 API Reference
 =============
 
+```{eval-rst}
 .. automodule:: mlsim.anomaly
    :members:
+```
\ No newline at end of file
diff --git a/docs/bias/api.rst b/docs/bias/api.md
similarity index 79%
rename from docs/bias/api.rst
rename to docs/bias/api.md
index 8e1f43b..6e2a380 100644
--- a/docs/bias/api.rst
+++ b/docs/bias/api.md
@@ -1,5 +1,8 @@
 API Reference
 =============
 
+
+```{eval-rst}
 .. automodule:: mlsim.bias
    :members:
+```
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 8f0f856..3d36532 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
 jupyter-book
-aif360==0.4.0
-matplotlib==3.5.1
-numpy==1.21.5
-pandas==1.1.5
-scipy==1.5.4
-setuptools==39.0.1
+aif360
+matplotlib
+numpy
+pandas
+scipy
+setuptools
 seaborn
 sphinx
 myst-nb

From 337d4ff46e78a6b492d5401c57e229252074f654 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Fri, 19 Apr 2024 16:11:33 -0400
Subject: [PATCH 5/5] pytest

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 3d36532..68766c3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ seaborn
 sphinx
 myst-nb
 myst-parser
+pytest
\ No newline at end of file
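
PATCH 5/5 adds pytest to the requirements; a test in this spirit could exercise the generalized demographic sampler. The test below is hypothetical and not included anywhere in the series; it only checks output sizes and value ranges.

```python
# Hypothetical pytest sketch; no such test file exists in the patches above.
import numpy as np
from mlsim.bias.bias_components import DemographicCorrelated


def test_demographic_three_groups():
    dem = DemographicCorrelated(rho_a=[0.5, 0.3, 0.2], rho_z=[0.5, 0.4, 0.2])
    a, z = dem.sample(200)
    assert len(a) == 200 and len(z) == 200
    assert set(np.unique(a)).issubset({0, 1, 2})
    assert set(np.unique(z)).issubset({0, 1})
```
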