
rough errors process 50%
johanos1 committed Mar 26, 2024
1 parent 8d08a77 commit 81df82a
Showing 11 changed files with 353 additions and 520 deletions.
21 changes: 11 additions & 10 deletions leakpro.py
@@ -1,17 +1,18 @@
"""Main script to run LEAKPRO on a target model."""

import joblib
import logging
import pickle
import numpy as np
from pathlib import Path
import random
import time
from pathlib import Path

import numpy as np
import torch
import yaml

import leakpro.train as util
from leakpro import dataset, models
from leakpro.mia_attacks.attack_scheduler import AttackScheduler
from leakpro.reporting.utils import prepare_priavcy_risk_report
import leakpro.train as util


def setup_log(name: str, save_file: bool) -> logging.Logger:
@@ -50,10 +51,10 @@ def setup_log(name: str, save_file: bool) -> logging.Logger:
if __name__ == "__main__":

RETRAIN = True
#args = "./config/adult.yaml"
#args = "./config/adult.yaml" # noqa: ERA001
args = "./config/cifar10.yaml"
with open(args, "rb") as f:
configs = yaml.load(f, Loader=yaml.Loader)
configs = yaml.safe_load(f)

# Set the random seed, log_dir and inference_game
torch.manual_seed(configs["run"]["random_seed"])
@@ -105,19 +106,19 @@ def setup_log(name: str, save_file: bool) -> logging.Logger:
data_file = configs["data"]["dataset"]
dataset_path = f"{data_dir}/{data_file}.pkl"
with open(dataset_path, "rb") as file:
population = pickle.load(file)
population = joblib.load(file)

# Get the training and test data
train_test_data = train_test_dataset

# Get the target model + metadata
target_model_metadata_path = f"{log_dir}/models_metadata.pkl"
with open(target_model_metadata_path, "rb") as f:
target_model_metadata = pickle.load(f)
target_model_metadata = joblib.load(f)
target_model_path = f"{log_dir}/model_0.pkl"
with open(target_model_path, "rb") as f:
if "adult" in configs["data"]["dataset"]:
target_model = models.NN(
configs["train"]["inputs"], configs["train"]["outputs"]
) # TODO: read metadata to get the model
elif "cifar10" in configs["data"]["dataset"]:
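The hunks above swap the unsafe yaml.load(f, Loader=yaml.Loader) for yaml.safe_load and replace pickle.load with joblib.load when reading the stored population data and model metadata. A minimal sketch of the resulting loading flow is given below; the config keys data_dir and log_dir and the exact paths are assumptions for illustration, not confirmed against the full file.

import joblib
import yaml

# safe_load only constructs plain Python objects (dicts, lists, scalars) and
# refuses arbitrary YAML tags, unlike yaml.load with the default Loader.
with open("./config/cifar10.yaml", "rb") as f:
    configs = yaml.safe_load(f)

# Assumed config keys, for illustration only.
data_dir = configs["data"]["data_dir"]
log_dir = configs["run"]["log_dir"]

# joblib.load reads the same pickle files but handles objects containing
# large numpy arrays, such as the population data, more efficiently.
dataset_path = f"{data_dir}/{configs['data']['dataset']}.pkl"
with open(dataset_path, "rb") as file:
    population = joblib.load(file)

with open(f"{log_dir}/models_metadata.pkl", "rb") as f:
    target_model_metadata = joblib.load(f)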
222 changes: 0 additions & 222 deletions leakpro/dataset.py
@@ -38,228 +38,6 @@ def __getitem__(self, idx):
y = torch.tensor(self.y[idx], dtype=torch.long)
return X, y

########################################################################################################################
# DATASET CLASS
########################################################################################################################


# class Dataset:
# """
# Wrapper around a dictionary-like formatted dataset, with functions to run preprocessing, to define default
# input/output features, and to split a dataset easily.
# """

# def __init__(
# self,
# data_dict: dict,
# default_input: str,
# default_output: str,
# default_group: str = None,
# preproc_fn_dict: dict = None,
# preprocessed: bool = False,
# ):
# """Constructor

# Args:
# data_dict: Contains the dataset as a dict.
# default_input: The key of the data_dict that should be used by default to get the input of a model.
# default_output: The key of the data_dict that should be used by default to get the expected output
# of a model.
# default_group: The key of the data_dict that should be used by default to get the group of the data points.
# This is used to construct class-dependent thresholds.
# preproc_fn_dict: Contains optional preprocessing functions for each feature.
# preprocessed: Indicates if the preprocessing of preproc_fn_dict has already been applied.
# """

# # Store parameters
# self.data_dict = data_dict
# self.default_input = default_input
# self.default_output = default_output
# self.default_group = default_group
# self.preproc_fn_dict = preproc_fn_dict

# # Store splits names and features names
# self.splits = list(self.data_dict)
# self.features = list(self.data_dict[self.splits[0]])

# # If preprocessing functions were passed as parameters, execute them
# if not preprocessed and preproc_fn_dict is not None:
# self.preprocess()

# def __len__(self):
# return len(self.data_dict[self.default_output])

# def preprocess(self):
# """
# Preprocessing function, executed by the constructor, based on the preproc_fn_dict attribute.
# """
# for split, feature in product(self.splits, self.features):
# if feature in list(self.preproc_fn_dict):
# fn = self.preproc_fn_dict[feature]
# self.data_dict[split][feature] = fn(self.data_dict[split][feature])

# def get_feature(self, split_name: str, feature_name: str, indices: list = None):
# """Returns a specific feature from samples of a specific split.

# Args:
# split_name: Name of the split.
# feature_name: Name of the feature.
# indices: Optional list of indices. If not specified, the entire subset is returned.

# Returns:
# The requested feature, from samples of the requested split.
# """

# # Two placeholders can be used to trigger either the default input or the default output, as specified during
# # object creation
# if feature_name == "<default_input>":
# feature_name = self.default_input
# elif feature_name == "<default_output>":
# feature_name = self.default_output
# elif feature_name == "<default_group>":
# feature_name = self.default_group

# # If 'indices' is not specified, returns the entire array. Else just return those indices
# if indices is None:
# return self.data_dict[split_name][feature_name]
# else:
# return self.data_dict[split_name][feature_name][indices]

# def subdivide(
# self,
# num_splits: int,
# split_names: list = None,
# method: str = "independent",
# split_size: Union[int, Dict[str, int]] = None,
# delete_original: bool = False,
# in_place: bool = True,
# return_results: bool = False,
# ):
# """Subdivides the splits contained in split_names into sub-splits, e.g. for shadow model training.

# Args:
# num_splits: Number of sub-splits per original split.
# split_names: The splits to subdivide (e.g. train and test). By default, includes all splits.
# method: Either independent or random. If method is independent, then the sub-splits are a partition of the
# original split (i.e. they contain the entire split without repetition). If method is random, then each
# sub-split is a random subset of the original split (i.e. some samples might be missing or repeated). If
# method is hybrid, then each sub-split is a random subset of the original split, with the guarantee that
# the 1st one is not overlapping with the others.
# split_size: If method is random, this is the size of one split (ignored if method is independent). Can
# either be an integer, or a dictionary of integer (one per split).
# delete_original: Indicates if the original split should be deleted.
# in_place: Indicates if the new splits should be included in the parent object or not
# return_results: Indicates if the new splits should be returned or not

# Returns:
# If return_results is True, a list of new Dataset objects with the sub-splits. Otherwise nothing, as
# the results are stored in self.data_dict when in_place is True.
# """

# # By default, includes all splits.
# if split_names is None:
# split_names = self.splits

# # List of results if in_place is False
# new_datasets_dict = [{} for _ in range(num_splits)]

# for split in split_names:

# if split_size is not None:
# parsed_split_size = (
# split_size if isinstance(split_size, int) else split_size[split]
# )

# # If method is random, then each sub-split is a random subset of the original split.
# if method == "random":
# assert (
# split_size is not None
# ), 'Argument split_size is required when method is "random" or "hybrid"'
# indices = np.random.randint(
# self.data_dict[split][self.features[0]].shape[0],
# size=(num_splits, parsed_split_size),
# )

# # If method is independent, then the sub-splits are a partition of the original split.
# elif method == "independent":
# indices = np.arange(self.data_dict[split][self.features[0]].shape[0])
# np.random.shuffle(indices)
# indices = np.array_split(indices, num_splits)

# # If method is hybrid, then each sub-split is a random subset of the original split, with the guarantee that
# # the 1st one is not overlapping with the others
# elif method == "hybrid":
# assert (
# split_size is not None
# ), 'Argument split_size is required when method is "random" or "hybrid"'
# available_indices = np.arange(
# self.data_dict[split][self.features[0]].shape[0]
# )
# indices_a = np.random.choice(
# available_indices, size=(1, parsed_split_size), replace=False
# )
# available_indices = np.setdiff1d(available_indices, indices_a.flatten())
# indices_b = np.random.choice(
# available_indices,
# size=(num_splits - 1, parsed_split_size),
# replace=True,
# )
# indices = np.concatenate((indices_a, indices_b))

# else:
# raise ValueError(f'Split method "{method}" does not exist.')

# for split_n in range(num_splits):
# # Fill the dictionary if in_place is True
# if in_place:
# self.data_dict[f"{split}{split_n:03d}"] = {}
# for feature in self.features:
# self.data_dict[f"{split}{split_n:03d}"][feature] = (
# self.data_dict[split][feature][indices[split_n]]
# )
# # Create new dictionaries if return_results is True
# if return_results:
# new_datasets_dict[split_n][f"{split}"] = {}
# for feature in self.features:
# new_datasets_dict[split_n][f"{split}"][feature] = (
# self.data_dict[split][feature][indices[split_n]]
# )

# # delete_original indicates if the original split should be deleted.
# if delete_original:
# del self.data_dict[split]

# # Update the list of splits
# self.splits = list(self.data_dict)

# # Return new datasets if return_results is True
# if return_results:
# return [
# Dataset(
# data_dict=new_datasets_dict[i],
# default_input=self.default_input,
# default_output=self.default_output,
# default_group=self.default_group,
# preproc_fn_dict=self.preproc_fn_dict,
# preprocessed=True,
# )
# for i in range(num_splits)
# ]

# def __str__(self):
# """
# Returns a string describing the dataset.
# """
# txt = [
# f'{" DATASET OBJECT ":=^48}',
# f"Splits = {self.splits}",
# f"Features = {self.features}",
# f"Default features = {self.default_input} --> {self.default_output}",
# "=" * 48,
# ]
# return "\n".join(txt)


class TabularDataset(Dataset):
"""Tabular dataset."""

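The commented-out Dataset class removed above documented a subdivide() helper that splits sample indices into sub-splits for shadow-model training, using an "independent" partition, a "random" draw, or a "hybrid" of the two. For reference, a standalone sketch of the two simpler modes described in those docstrings follows; the function name and signature are illustrative and not part of the repository.

import numpy as np

def split_indices(num_samples, num_splits, method="independent", split_size=None):
    """Return one array of sample indices per sub-split (illustrative helper)."""
    if method == "independent":
        # Partition: every sample lands in exactly one sub-split, no repetition.
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        return np.array_split(indices, num_splits)
    if method == "random":
        # Each sub-split is a random draw; samples may be repeated or missing.
        if split_size is None:
            raise ValueError('split_size is required when method is "random"')
        return list(np.random.randint(num_samples, size=(num_splits, split_size)))
    raise ValueError(f'Split method "{method}" does not exist.')

# Example: partition 10 samples into 3 disjoint sub-splits.
subsets = split_indices(10, 3, method="independent")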
