From d84962d9a62ae3772efc3f44de671fd91e28ebb1 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Fri, 10 Feb 2023 16:37:29 -0500
Subject: [PATCH 1/5] fix doc build

---
 docs/bias/populations.md      | 10 +++++++++-
 mlsim/bias/bias_components.py |  4 ++--
 mlsim/bias/populations.py     |  6 +++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/docs/bias/populations.md b/docs/bias/populations.md
index cd5771b..90181c3 100644
--- a/docs/bias/populations.md
+++ b/docs/bias/populations.md
@@ -12,8 +12,16 @@ The default is not a completely iid and balanced population.
 All populations are defined by the following variables: $A$, $Z$, $Y$, $X$.
 A population has a `sample` method and attributes for each component sampler.
 
+## Object Structure
+
+```{eval-rst}
+.. autoclass:: mlsim.bias.Population
+```
+
+This takes one sampler for each factor of the joint data distribution.
+
 ## Sampling bias
 
 Populations also have samplers that
-insert sampling, rather than population level biases. This allows for the creation of a population with one set of biases and to use the same object to draw additional datasets that have additionally biased sampls. For example you may wish to have training data and audit datasets that have different disributions to demonstrate the impact of a biased sampling at one of those times.
+insert sampling, rather than population level, biases. This allows a population to be created with one set of biases and the same object to be used to draw additional datasets with additionally biased samples. For example, you may wish to have training data and audit datasets with different distributions to demonstrate the impact of biased sampling at one of those times.
diff --git a/mlsim/bias/bias_components.py b/mlsim/bias/bias_components.py
index 59f0506..4b50087 100644
--- a/mlsim/bias/bias_components.py
+++ b/mlsim/bias/bias_components.py
@@ -416,9 +416,9 @@ def __init__(self,dist,loc,spread):
         Parameters
         ----------
         loc : list-like
-            one location parameter value per true value, protected attribute pair
+            one location parameter value per (true value, protected attribute) pair
         spread : list-like
-            one spread parameter value per proxy value, protected attribute pair
+            one spread parameter value per (proxy value, protected attribute) pair
         '''
         theta_yaz = [[[(lii,sii) for lii,sii in zip(li,si)] for li in loc]
                      for si in spread]
diff --git a/mlsim/bias/populations.py b/mlsim/bias/populations.py
index f2a7cb3..ae9bd16 100644
--- a/mlsim/bias/populations.py
+++ b/mlsim/bias/populations.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-import aif360.datasets
+from aif360.datasets import StructuredDataset
 
 from .bias_components import Demographic, Target, Feature, FeatureNoise
 
 default_params = {'dem':None,}
@@ -142,11 +142,11 @@ def make_StructuredDataset(self,a,z,y,x):
 
         Returns
         --------
-        aif360.datasets.StructuredDataset
+        aif360.datasets.StructuredDataset containing the data with y as the
            target and a as protected attribute.
        '''
        df = self.make_DataFrame(a,z,y,x)
-       return aif360.datasets.StructuredDataset(df, ['y'], ['a'])
+       return StructuredDataset(df, ['y'], ['a'])

    def get_parameter_description(self):
        '''
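
A minimal usage sketch of the `Population` interface documented in the patch above (not itself part of any patch): it assumes the default constructor works as-is and that `sample(N)` returns the four components in the order a, z, y, x; only `sample`, `make_DataFrame`, and `make_StructuredDataset` are taken from the code shown.

```python
# Sketch only: assumed defaults and return order noted above.
from mlsim.bias import Population

pop = Population()                            # default component samplers
a, z, y, x = pop.sample(1000)                 # assumed return order: a, z, y, x
df = pop.make_DataFrame(a, z, y, x)           # pandas DataFrame of the sample
sd = pop.make_StructuredDataset(a, z, y, x)   # aif360 StructuredDataset (this patch)
```
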
From 827d1a482c6bbd5821e60f6a7bb8751147442197 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Fri, 10 Feb 2023 16:51:25 -0500
Subject: [PATCH 2/5] links

---
 docs/_config.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/_config.yml b/docs/_config.yml
index e692473..c6b6b97 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -48,8 +48,8 @@ parse:
 # HTML-specific settings
 html:
   favicon                   : ""     # A path to a favicon image
-  use_edit_page_button      : false  # Whether to add an "edit this page" button to pages. If `true`, repository information in repository: must be filled in
-  use_repository_button     : false  # Whether to add a link to your repository button
+  use_edit_page_button      : true   # Whether to add an "edit this page" button to pages. If `true`, repository information in repository: must be filled in
+  use_repository_button     : true   # Whether to add a link to your repository button
   use_issues_button         : false  # Whether to add an "open an issue" button
   use_multitoc_numbering    : true   # Continuous numbering across parts/chapters
   extra_navbar              : Powered by Jupyter Book  # Will be displayed underneath the left navbar.
@@ -78,7 +78,7 @@ launch_buttons:
 
 repository:
   url           : https://github.com/ml4sts/ml-sim  # The URL to your book's repository
-  path_to_book  : ""      # A path to your book's folder, relative to the repository root.
+  path_to_book  : "docs"  # A path to your book's folder, relative to the repository root.
   branch        : main    # Which branch of the repository should be used when creating links
 
 #######################################################################################

From 1800c69958697983c6213e7c4261fd0174801407 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Tue, 9 May 2023 10:38:39 -0400
Subject: [PATCH 3/5] allow |A|>2 and some fixes

---
 mlsim/bias/bias_components.py | 111 ++++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 38 deletions(-)

diff --git a/mlsim/bias/bias_components.py b/mlsim/bias/bias_components.py
index 4b50087..1b58243 100644
--- a/mlsim/bias/bias_components.py
+++ b/mlsim/bias/bias_components.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 from collections import namedtuple
+from collections.abc import Iterable
 
 DemParams = namedtuple('DemParams',['Pa','Pz_a'])
 TargetParams = namedtuple('TargetParams',['Py_az'])
@@ -37,6 +38,8 @@ def __init__(self,rho_a=.5,rho_z=.5):
         default is independent sampling of a and z
         '''
         Pa = [1-rho_a, rho_a]
+        self.A = [0, 1]
+
         Pz = [1-rho_z, rho_z]
 
         super().__init__((Pa,[Pz,Pz]))
@@ -56,7 +59,7 @@ def sample(self,N):
             a tuple of lenght 2 with elements a and z as column np arrays
             each of length N
         '''
-        a = np.random.choice([0,1], p= self.params.Pa, size=N)
+        a = np.random.choice(self.A, p= self.params.Pa, size=N)
         z = [np.random.choice([0,1], p= self.params.Pz_a[ai]) for ai in a]
 
         return np.asarray(a).T,np.asarray(z).T
@@ -110,14 +113,24 @@ class DemographicCorrelated(Demographic):
 
     def __init__(self,rho_a=.5,rho_z=[.5,.3]):
         '''
-        P(A = 1) = rho_a
-        P(Z=1,A=0) = rho_z[0]
-        P(Z=1,A=1) = rho_z[1]
+        P(A = 1) = rho_a or P(A) = rho_a
+        P(Z=1|A=i) = rho_z[i]
+
+        Parameters
+        ----------
+        rho_a : scalar or vector of floats
+            probability of A = 1, or the distribution of A
+        rho_z : vector of length 2 or len(rho_a)
+            probability that Z = 1 for A = i
+        '''
+        if isinstance(rho_a, Iterable):
+            Pa = rho_a
+            self.A = list(range(len(rho_a)))
+        else:
+            Pa = [1-rho_a, rho_a]
+            self.A = [0, 1]
+
+        Pz_a = [[1-rho_zi, rho_zi] for rho_zi in rho_z]
-        '''
-        Pa = [1-rho_a, rho_a]
-        Pz_a = [[1-rho_z[0], rho_z[0]],
-                [1-rho_z[1], rho_z[1]]]
 
         Sampler.__init__(self,(Pa,Pz_a))
@@ -125,7 +138,7 @@ class Target(Sampler):
     '''
     '''
     ParamCreator = TargetParams
-    def __init__(self,beta=0.05):
+    def __init__(self,beta=0.05,N_a=2):
         '''
         P(Y=Z|A,Z ) = P(Y=Z) = 1-beta
         make errors with prob beta
@@ -133,7 +146,7 @@ def __init__(self,beta=0.05):
         beta =0, makes Y =Z
         '''
         pyeqz = [1-beta,beta]
-        Py_az = [[pyeqz,pyeqz],[pyeqz,pyeqz]]
+        Py_az = [[pyeqz,pyeqz]]*N_a
 
         super().__init__((Py_az,))
@@ -147,7 +160,6 @@ def sample(self,a,z):
         beta : float
 
         '''
-
         y = [np.random.choice([zi,1-zi],p= self.params.Py_az[ai][zi])
                 for ai,zi in zip(a,z)]
 
@@ -157,15 +169,15 @@ class TargetDisadvantagedError(Target):
     '''
     '''
-    def __init__(self,beta=.1):
+    def __init__(self,beta=.1,N_a=2):
         '''
-        make errors with prob beta
+        make errors with prob beta, except for the advantaged group, A=(N_a-1)
         P(Y=Z|A=1,Z ) = P(Y=Z|A=1) = 1-beta
         P(Y=Z|A=0,Z ) = P(Y=Z|A=0) = 1
 
         '''
         pyeqz = [1-beta,beta]
-        Py_az = [[[1,0],[1,0]],[pyeqz,pyeqz]]
+        Py_az = [[pyeqz, pyeqz]]*(N_a-1) + [[[1, 0], [1, 0]]]
         Sampler.__init__(self,(Py_az,))
 
 class TargetTwoError(Target):
@@ -183,6 +195,23 @@ def __init__(self,beta=[0,.1]):
         pyz_a0 = [1-beta[0],beta[0]]
         pyz_a1 = [1-beta[1],beta[1]]
         Py_az = [[pyz_a0,pyz_a0],[pyz_a1,pyz_a1]]
         Sampler.__init__(self,(Py_az,))
+
+class TargetAllAError(Target):
+    '''
+    '''
+
+    def __init__(self, beta=[0, .1]):
+        '''
+        make errors with prob beta
+        P(Y=Z|A=1,Z ) = P(Y=Z|A=1) = 1-beta1
+        P(Y=Z|A=0,Z ) = P(Y=Z|A=0) = 1-beta0
+
+        '''
+        # pyz_a0 = [1-beta[0], beta[0]]
+        # pyz_a1 = [1-beta[1], beta[1]]
+        Py_az = [[[1-betaai, betaai]]*2 for betaai in beta]
+        Sampler.__init__(self, (Py_az,))
+
 class TargetFlipNegative(Target):
     '''
     '''
@@ -195,10 +224,10 @@ def __init__(self,beta=[0,.1]):
         P(Y=Z|Z =0) = 1
 
         '''
-        pyz1_a0 = [1-beta[0],beta[0]]
-        pyz1_a1 = [1-beta[1],beta[1]]
-        no_error = [1,0] # if z=0, P(Y=z) =1
-        Py_az = [[no_error,pyz1_a0],[no_error,pyz1_a1]]
+        # pyz1_a0 = [1-beta[0],beta[0]]
+        # pyz1_a1 = [1-beta[1],beta[1]]
+        no_error = [1,0] # if z=0, P(Y=z) =1
+        Py_az = [[no_error, [1-betaai, betaai]] for betaai in beta]
         Sampler.__init__(self,(Py_az,))
 
 class TargetFlipAllIndep(Target):
@@ -214,9 +243,6 @@ def __init__(self,beta=[[.05,.1],[.05,.1]]):
 
         '''
-        pyz1_a0 = [1-beta[0],beta[0]]
-        pyz1_a1 = [1-beta[1],beta[1]]
-        no_error = [1,0] # if z=0, P(Y=z) =1
         Py_az = [[[1-b,b] for b in be] for be in beta]
         Sampler.__init__(self,(Py_az,))
@@ -239,7 +265,7 @@ class Feature(Sampler):
     '''
     ParamCreator = FeatureParams
     def __init__(self,dist= mean_only_mvn,mu = [[5,2],[2,5]],
-                 param_tuple = None):
+                 param_tuple = None,N_a =2):
         '''
         Parameters
         ----------
@@ -255,7 +281,8 @@ def __init__(self,dist= mean_only_mvn,mu = [[5,2],[2,5]],
             super().__init__(param_tuple)
         else:
             # default params passed
-            theta = [[mu,mu],[mu, mu]]
+            # mu has diffs for Z=0,1; repeat for all A and all Y
+            theta = [[mu]*N_a]*2
             super().__init__((dist,theta))
 
     def sample(self,a,z,y):
@@ -281,19 +308,23 @@ def sample(self,a,z,y):
 
         if type(self.params.theta[0][0][0]) == tuple:
             # if a tuple, then expand and pass 2 params
+
             x = [self.params.distfunc(*self.params.theta[yi][ai][zi])
-                    for ai,zi,yi in zip(z,a,y)]
+                    for ai,zi,yi in zip(a,z,y)]
         else:
             x = [self.params.distfunc(self.params.theta[yi][ai][zi])
                     for ai,zi,yi in zip(a,z,y)]
 
         return np.asarray(x)
 
+mvn = lambda mu,var :np.random.multivariate_normal(mu,var*np.eye(len(mu)))
+
 class FeatureSharedParam(Feature):
     '''
     feature sampler with one parameter shared across Z (eg shared spread)
     A and Y have no impact on X
     '''
-    def __init__(self,dist,loc,spread):
+
+    def __init__(self, loc, spread, dist=mvn, N_a=2):
         '''
         unique locations and shared spread for no impact of A or Y
 
         Parameters
         ----------
         dist : function handle
             function to sample X|parameters where the paramters are dependend
             on Z but not A or Y
         loc : list-like length |Z|
             location parameter of dist, one per value of z
         spread : scalar
             spread parameter of dist
 
         '''
         theta_z = [(li,spread) for li in loc]
-        theta = [[theta_z,theta_z],[theta_z, theta_z]]
+        theta = [[theta_z]*N_a]*2
 
         super().__init__(param_tuple=(dist,theta))
 
 class FeatureTwoParams(Feature):
     '''
     feature sampler with two unique parameters per class
     '''
-    def __init__(self,dist,loc,spread):
+
+    def __init__(self, loc, spread, dist=mvn, N_a=2):
         '''
         unique locations and shared spread for z, no impact of a an y
 
         Parameters
         ----------
         dist : function handle
             function to sample X|parameters where the paramters are dependend
             on Z, but not A or Y
         loc : list-like length |Z|
             location parameter of dist, one per value of z
         spread : list-like length |Z|
             spread parameter of dist, one per value of z
 
         '''
-        theta_z = [(li,si) for li,si in zip(loc,spread)]
-        theta = [[theta_z,theta_z],[theta_z, theta_z]]
+        theta_z = [(li, si) for li, si in zip(loc, spread)]
+        theta = [[theta_z]*N_a]*2
 
         super().__init__(param_tuple=(dist,theta))
 
 class FeaturePerGroupTwoParam(Feature):
@@ -348,13 +380,17 @@ def __init__(self,dist,loc,spread):
         dist : function handle
             function to sample X|parameters where the paramters are dependend
             on Z,A,Y
-        loc : list-like length |Z| of lists length 2
+        loc : list-like length |Z| of lists length |A|
             location parameter of dist, one per value of z,a
-        spread : list-like length |Z| of lists length 2
+        spread : list-like length |Z| of lists length |A|
             spread parameter of dist, one per value of z,a
         '''
+        # print(len(loc), len(spread))
+        # print(len(loc[0]), len(spread[0]))
         theta_za = [[(lii,sii) for lii,sii in zip(li,si)] for li,si in zip(loc,spread)]
+        # repeat so that features do not vary with Y
         theta = [theta_za,theta_za]
+        # print(theta)
 
         super().__init__(param_tuple=(dist,theta))
 
 class FeaturePerGroupSharedParamWithinGroup(Feature):
@@ -434,16 +470,15 @@ class FeatureNoise(Sampler):
     '''
     '''
     ParamCreator = NoiseParams
-    def __init__(self,dist= shape_spread_only_mvn,sig = 1.0):
+    def __init__(self, dist=shape_spread_only_mvn, sig=1.0, N_a=2):
         '''
         '''
         if type(sig) ==float:
             # constant noise
-            theta = [[[sig,sig],[sig,sig]],[[sig,sig],[sig,sig]]]
-        elif len(sig) ==2:
+            theta = [[[sig,sig]]*N_a]*2
+        else:
             # diff noise for protected attributes
-            theta = [[[sig[0],sig[0]],[sig[1],sig[1]]],
-                    [[sig[0],sig[0]],[sig[1],sig[1]]]]
+            theta = [[[sigi,sigi] for sigi in sig]]*2
 
         super().__init__((dist,theta))
@@ -538,7 +573,7 @@ def sample(a,z,y,x,dist,theta):
     # functions for combining noise and true vectors
     x_a = {0: lambda x,n: np.concatenate((x[:d_noise],n)),
           1: lambda x,n: np.concatenate((n, x[d_shared-1:d], x[:d_noise]))}
-    x = [x_a[a](x_zi,x_ni) for a,x_zi,x_ni in zip(a,x_z,x_n)]
+    x = [x_a[a](x_zi,x_ni) for a,x_zi,x_ni in zip(a,x,x_n)]
     x = np.asarray(x)
 
     return x
@@ -551,7 +586,7 @@ def sample(a,z,y,x,dist,theta):
 
 
 
-def feature_proxy(a,z,y,dist,theta):
+def feature_proxy(a,z,y,distfunc,theta):
     '''
     some features are related to the ground truth and some are
     realated to the proxy,
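
A rough sketch of how the |A|>2 samplers changed in PATCH 3/5 fit together (not part of the patch series): the parameter values are illustrative only, and the import path assumes the module layout shown in the diffs.

```python
# Illustrative sketch of the generalized samplers from PATCH 3/5.
from mlsim.bias.bias_components import DemographicCorrelated, Target

# Three groups: P(A=i) comes from rho_a, P(Z=1|A=i) from rho_z[i].
dem = DemographicCorrelated(rho_a=[0.5, 0.3, 0.2], rho_z=[0.5, 0.4, 0.2])
a, z = dem.sample(500)

# Proxy target with error rate beta, replicated across the same three groups.
target = Target(beta=0.05, N_a=3)
y = target.sample(a, z)
```
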
From fb3b0ec036d9e9ea08a42dd78433a59f47a5acf0 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Fri, 19 Apr 2024 16:04:52 -0400
Subject: [PATCH 4/5] bump reqs to latest/any; fix docs

---
 docs/_toc.yml                    |  4 ++--
 docs/anomaly/{api.rst => api.md} |  2 ++
 docs/bias/{api.rst => api.md}    |  3 +++
 requirements.txt                 | 12 ++++++------
 4 files changed, 13 insertions(+), 8 deletions(-)
 rename docs/anomaly/{api.rst => api.md} (81%)
 rename docs/bias/{api.rst => api.md} (79%)

diff --git a/docs/_toc.yml b/docs/_toc.yml
index eb71690..062ce8d 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -6,14 +6,14 @@ parts:
     - file: anomaly/simpsons_paradox
     - file: anomaly/rate_sp
     - file: anomaly/regression_sp
-    - file: anomaly/api.rst
+    - file: anomaly/api
     # sections:
     #   - file: path/to/part1/chapter2/section1
   - caption: Biases
     chapters:
     - file: bias/populations
     - file: bias/demo_populations
-    - file: bias/api.rst
+    - file: bias/api
     # sections:
     #   - file: path/to/part2/chapter2/section1
   - caption: Basic Generators
diff --git a/docs/anomaly/api.rst b/docs/anomaly/api.md
similarity index 81%
rename from docs/anomaly/api.rst
rename to docs/anomaly/api.md
index b2fea45..bc2c80c 100644
--- a/docs/anomaly/api.rst
+++ b/docs/anomaly/api.md
@@ -1,5 +1,7 @@
 API Reference
 =============
 
+```{eval-rst}
 .. automodule:: mlsim.anomaly
    :members:
+```
\ No newline at end of file
diff --git a/docs/bias/api.rst b/docs/bias/api.md
similarity index 79%
rename from docs/bias/api.rst
rename to docs/bias/api.md
index 8e1f43b..6e2a380 100644
--- a/docs/bias/api.rst
+++ b/docs/bias/api.md
@@ -1,5 +1,8 @@
 API Reference
 =============
 
+
+```{eval-rst}
 .. automodule:: mlsim.bias
    :members:
+```
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 8f0f856..3d36532 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
 jupyter-book
-aif360==0.4.0
-matplotlib==3.5.1
-numpy==1.21.5
-pandas==1.1.5
-scipy==1.5.4
-setuptools==39.0.1
+aif360
+matplotlib
+numpy
+pandas
+scipy
+setuptools
 seaborn
 sphinx
 myst-nb

From 337d4ff46e78a6b492d5401c57e229252074f654 Mon Sep 17 00:00:00 2001
From: Sarah M Brown
Date: Fri, 19 Apr 2024 16:11:33 -0400
Subject: [PATCH 5/5] pytest

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 3d36532..68766c3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ seaborn
 sphinx
 myst-nb
 myst-parser
+pytest
\ No newline at end of file
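
PATCH 5/5 adds pytest to the requirements; a test in this spirit could exercise the generalized demographic sampler. The test below is hypothetical and not included anywhere in the series; it only checks output sizes and value ranges.

```python
# Hypothetical pytest sketch; no such test file exists in the patches above.
import numpy as np
from mlsim.bias.bias_components import DemographicCorrelated


def test_demographic_three_groups():
    dem = DemographicCorrelated(rho_a=[0.5, 0.3, 0.2], rho_z=[0.5, 0.4, 0.2])
    a, z = dem.sample(200)
    assert len(a) == 200 and len(z) == 200
    assert set(np.unique(a)).issubset({0, 1, 2})
    assert set(np.unique(z)).issubset({0, 1})
```
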