From a5f46d75e455ab36c5557e26425a7575f17bb1fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=AA=20Nguy=C3=AAn=20Hoang?= Date: Mon, 17 Feb 2025 09:56:47 +0100 Subject: [PATCH] WIP models --- .../modules/preference_learning/base.py | 6 ++-- .../generalized_bradley_terry.py | 2 +- .../scaling/lipschitz_quantile_shift.py | 6 ++-- .../modules/scaling/lipschitz_standardize.py | 6 ++-- .../src/solidago/modules/scaling/mehestan.py | 24 +++++++++---- solidago/src/solidago/state/__init__.py | 6 ++-- .../src/solidago/state/models/__init__.py | 4 +-- solidago/src/solidago/state/models/base.py | 21 +++++++---- solidago/src/solidago/state/models/direct.py | 2 +- solidago/src/solidago/state/models/scaled.py | 35 ++++++++++++++----- solidago/src/solidago/state/models/score.py | 25 +++++++++---- .../src/solidago/state/models/user_models.py | 26 +++++++++++--- 12 files changed, 115 insertions(+), 48 deletions(-) diff --git a/solidago/src/solidago/modules/preference_learning/base.py b/solidago/src/solidago/modules/preference_learning/base.py index c46a3d0a51..3e48f6a338 100644 --- a/solidago/src/solidago/modules/preference_learning/base.py +++ b/solidago/src/solidago/modules/preference_learning/base.py @@ -23,7 +23,7 @@ def __call__(self, for user in users: logger.info(f" Learning user {user}'s base model") result[user] = self.user_learn(user, entities, - assessments.get(user), comparisons.get(user), user_models[user].base_model()[0]) + assessments.get(user), comparisons.get(user), user_models[user].base_model()) return result @abstractmethod @@ -32,7 +32,7 @@ def user_learn(self, entities: Entities, assessments: Assessments, # key_names == ["criterion", "entity_name"] comparisons: Comparisons, # key_names == ["criterion", "left_name", "right_name"] - base_model: BaseModel - ) -> BaseModel: + base_model: ScoringModel + ) -> ScoringModel: """Learns a scoring model, given user judgments of entities """ raise NotImplementedError diff --git a/solidago/src/solidago/modules/preference_learning/generalized_bradley_terry.py b/solidago/src/solidago/modules/preference_learning/generalized_bradley_terry.py index 3a87dbef63..f7c39cce79 100644 --- a/solidago/src/solidago/modules/preference_learning/generalized_bradley_terry.py +++ b/solidago/src/solidago/modules/preference_learning/generalized_bradley_terry.py @@ -111,7 +111,7 @@ def user_learn(self, entities: Entities, assessments: Assessments, # Not used comparisons: Comparisons, # key_names == ["criterion", "left_name", "right_name"] - init_model: BaseModel, + init_model: ScoringModel, ) -> DirectScoring: """ Learns only based on comparisons """ if self.last_comparison_only: diff --git a/solidago/src/solidago/modules/scaling/lipschitz_quantile_shift.py b/solidago/src/solidago/modules/scaling/lipschitz_quantile_shift.py index 04959e6d37..b8fe377e0e 100644 --- a/solidago/src/solidago/modules/scaling/lipschitz_quantile_shift.py +++ b/solidago/src/solidago/modules/scaling/lipschitz_quantile_shift.py @@ -35,7 +35,7 @@ def __call__(self, entities: Entities, user_models: UserModels) -> UserModels: Will be scaled by the Scaling method """ scores = user_models.score(entities).reorder_keys(["criterion", "username", "entity_name"]) - scales = ScaleDict(key_names=["criterion"]) # the same scale will apply to all users + translations = MultiScore() for criterion in scores.get_set("criterion"): scores_df = scores[criterion].to_df() weights = 1 / scores_df.groupby("username").transform("size") @@ -48,10 +48,10 @@ def __call__(self, entities: Entities, user_models: UserModels) -> UserModels: right_uncertainties=np.array(scores_df["right_unc"], dtype=np.float64), error=self.error, ) + self.target_score - scales[criterion] = (1, 0, 0, translation_value, 0, 0) + translations.set(criterion, translation_value) return UserModels({ - username: ScaledModel(model, scales, note="quantile_shift") + username: ScaledModel(model, translations=translations, note="quantile_shift") for username, model in user_models }) diff --git a/solidago/src/solidago/modules/scaling/lipschitz_standardize.py b/solidago/src/solidago/modules/scaling/lipschitz_standardize.py index 2b7277b274..3042a12819 100644 --- a/solidago/src/solidago/modules/scaling/lipschitz_standardize.py +++ b/solidago/src/solidago/modules/scaling/lipschitz_standardize.py @@ -19,7 +19,7 @@ def __init__(self, dev_quantile: float=0.9, lipschitz: float=0.1, error: float=1 def __call__(self, entities: Entities, user_models: UserModels) -> UserModels: scores = user_models.score(entities).reorder_keys(["criterion", "username", "entity_name"]) - scales = ScaleDict(key_names=["criterion"]) # the same scale will apply to all users + multiplicators = MultiScore() for criterion in scores.get_set("criterion"): scores_df = scores.to_df() weights = 1 / scores_df.groupby("username").transform("size") @@ -33,9 +33,9 @@ def __call__(self, entities: Entities, user_models: UserModels) -> UserModels: default_dev=1.0, error=self.error, ) - scales[criterion] = (1 / std_dev, 0, 0, 0, 0, 0) + multiplicators.set(criterion, 1 / std_dev) return UserModels({ - username: ScaledModel(model, scales, note="standardize") + username: ScaledModel(model, multiplicators=multiplicators, note="standardize") for username, model in user_models }) diff --git a/solidago/src/solidago/modules/scaling/mehestan.py b/solidago/src/solidago/modules/scaling/mehestan.py index ba519a5b1f..22762c1191 100644 --- a/solidago/src/solidago/modules/scaling/mehestan.py +++ b/solidago/src/solidago/modules/scaling/mehestan.py @@ -93,10 +93,17 @@ def __call__(self, scores = user_models.score(entities).reorder_keys(["criterion", "username", "entity_name"]) logger.info(f"Mehestan 0. Terminated") - users, scales = self.compute_scales(users, scores, made_public) + users, multiplicators, translations = self.compute_scales(users, scores, made_public) + multiplicators = multiplicators.groupby(["username"]) + translations = translations.groupby(["username"]) return users, UserModels({ - username: ScaledModel(model, scales[username], note="Mehestan") + username: ScaledModel( + model, + multiplicators=multiplicators[username], + translations=translations[username], + note="Mehestan" + ) for username, model in user_models }) @@ -105,7 +112,7 @@ def compute_scales(self, users: Users, # Must have column "trust_score" scores: MultiScore, # key_names == ["criterion", "username", "entity_name"] made_public: MadePublic, # key_names == ["username", "entity_name"] - ) -> ScaleDict: # key_names == ["username", "criterion"] + ) -> tuple[Users, MultiScore, MultiScore]: # key_names == ["username", "criterion"] """ Compute the scales for all criteria. This method should be inherited and parallelized for heavy-load applications that can afford multi-core operations. @@ -123,10 +130,15 @@ def compute_scales(self, scales: ScaleDict With key_names == ["username", "criterion"] """ - scales = ScaleDict(key_names=["criterion", "username"]) + multiplicators = MultiScore(key_names=["criterion", "username"]) + translations = MultiScore(key_names=["criterion", "username"]) for criterion in scores.get_set("criterion"): - users, scales[criterion] = self.scale_criterion(users, scores[criterion], made_public, criterion) - return users, scales.reorder_keys(["username", "criterion"]) + users, m, t = self.scale_criterion(users, scores[criterion], made_public, criterion) + m["criterion"] = criterion + t["criterion"] = criterion + multiplicators = multiplicators | m + translations = translations | t + return users, multiplicators, translations def scale_criterion(self, users: Users, diff --git a/solidago/src/solidago/state/__init__.py b/solidago/src/solidago/state/__init__.py index f01f47efa9..a290b2bab1 100644 --- a/solidago/src/solidago/state/__init__.py +++ b/solidago/src/solidago/state/__init__.py @@ -21,10 +21,8 @@ "Comparison", "Comparisons", "VotingRights", "Score", "MultiScore", - "ScoringModel", "BaseModel", - "UserModels", - "DirectScoring", - "ScaleDict", "ScaledModel", + "ScoringModel", "DirectScoring", "ScaledModel", "PostProcessedModel", "SquashedModel", + "UserModels", "State", "TournesolExport", ] diff --git a/solidago/src/solidago/state/models/__init__.py b/solidago/src/solidago/state/models/__init__.py index f63139e7e2..eaa3fa60ca 100644 --- a/solidago/src/solidago/state/models/__init__.py +++ b/solidago/src/solidago/state/models/__init__.py @@ -1,6 +1,6 @@ from .score import Score, MultiScore -from .base import ScoringModel, BaseModel +from .base import ScoringModel from .user_models import UserModels from .direct import DirectScoring -from .scaled import ScaleDict, ScaledModel +from .scaled import ScaledModel from .post_processed import PostProcessedModel, SquashedModel diff --git a/solidago/src/solidago/state/models/base.py b/solidago/src/solidago/state/models/base.py index 6e3719817a..8f2196888a 100644 --- a/solidago/src/solidago/state/models/base.py +++ b/solidago/src/solidago/state/models/base.py @@ -15,6 +15,7 @@ class ScoringModel(ABC): saved_argsnames: list[str]=["note"] def __init__(self, + parent: Optional["ScoringModel"]=None, dataframes: Optional[dict[str, DataFrame]]=None, depth: int=0, note: str="None", @@ -34,15 +35,16 @@ def __init__(self, if name == "directs": self.dfs[name] = MultiScore(df, key_names=["entity_name", "criterion"]) elif name == "multiplicators" and isinstance(df, DataFrame): - self.dfs[name] = MultiScore(df, key_names=["depth", "criterion"]).groupby(["depth"]) + self.dfs[name] = MultiScore(df, key_names=["depth", "criterion"]) elif name == "translations" and isinstance(df, DataFrame): - self.dfs[name] = MultiScore(df, key_names=["depth", "criterion"]).groupby(["depth"]) - if "parent" in kwargs: - if isinstance(kwargs["parent"], ScoringModel): - self.parent = kwargs["parent"] - elif isinstance(kwargs["parent"], ScoringModel): + self.dfs[name] = MultiScore(df, key_names=["depth", "criterion"]) + if parent is not None: + if isinstance(parent, ScoringModel): + parent.set_depth(depth + 1) + self.parent = parent + elif isinstance(parent, ScoringModel): import solidago.state.models as models - parent_cls, parent_kwargs = kwargs["parent"] + parent_cls, parent_kwargs = parent self.parent = getattr(models, parent_cls)(self.dfs, depth + 1, **parent_kwargs) else: raise ValueError(f"{kwargs['parent']} has unhandled type {type(kwargs['parent'])}") @@ -82,6 +84,11 @@ def is_base(self) -> bool: def evaluated_entities(self, entities: "Entities") -> "Entities": return entities if self.is_base() else self.parent.evaluated_entities(entities) + def set_depth(self, depth: int) -> None: + self.depth = depth + if not self.is_base(): + self.parent.set_depth(depth + 1) + def to_direct(self, entities: "Entities") -> "DirectScoring": from .direct import DirectScoring direct_scoring = DirectScoring() diff --git a/solidago/src/solidago/state/models/direct.py b/solidago/src/solidago/state/models/direct.py index fde7265a44..7ca8872bf9 100644 --- a/solidago/src/solidago/state/models/direct.py +++ b/solidago/src/solidago/state/models/direct.py @@ -3,7 +3,7 @@ from pandas import DataFrame, Series from .score import Score, MultiScore -from .base import ScoringModel, BaseModel +from .base import ScoringModel class DirectScoring(ScoringModel): diff --git a/solidago/src/solidago/state/models/scaled.py b/solidago/src/solidago/state/models/scaled.py index 8cab6d4337..9fe641699d 100644 --- a/solidago/src/solidago/state/models/scaled.py +++ b/solidago/src/solidago/state/models/scaled.py @@ -8,17 +8,36 @@ class ScaledModel(ScoringModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.multiplicators = self.dfs["multiplicators"][str(self.depth)] - self.translations = self.dfs["translations"][str(self.depth)] + def __init__(self, + parent: ScoringModel, + dataframes: Optional[dict[str, DataFrame]]=None, + depth: int=0, + note: str="None", + multipliers: Optional[MultiScore]=None, + translations: Optional[MultiScore]=None, + *args, + **kwargs + ): + super().__init__(parent=parent, *args, **kwargs) + if "multipliers" not in self.dfs: + self.dfs["multipliers"] = MultiScore(key_names=["depth", "criterion"]) + if "translations" not in self.dfs: + self.dfs["translations"] = MultiScore(key_names=["depth", "criterion"]) + + @property + def multiplier(self) -> MultiScore: + return self.dfs["multipliers"].get(depth=self.depth) + + @property + def translation(self) -> MultiScore: + return self.dfs["translation"].get(depth=self.depth) def score(self, entity: "Entity") -> MultiScore: return self.scale(self.parent.score(entity)) def scale(self, score: MultiScore) -> MultiScore: - return self.multiplicator * score + self.translation + return self.multiplier * score + self.translation - def rescale(self, multiplicator: Score, translation: Score) -> None: - self.multiplicator *= multiplicator - self.translation = multiplicator * self.translation + translation + def rescale(self, multiplier: Score, translation: Score) -> None: + self.multiplier *= multiplier + self.translation = multiplier * self.translation + translation diff --git a/solidago/src/solidago/state/models/score.py b/solidago/src/solidago/state/models/score.py index 9f52162a5d..daf75e83c0 100644 --- a/solidago/src/solidago/state/models/score.py +++ b/solidago/src/solidago/state/models/score.py @@ -1,6 +1,7 @@ import math from typing import Optional, Union, Any +from pandas import Series, DataFrame from solidago.primitives.datastructure import UnnamedDataFrame @@ -172,15 +173,27 @@ def input2dict(self, *args, keys_only: bool=False, **kwargs) -> dict: key_value_columns = self.key_names if keys_only else (self.key_names + self.value_names) if keys_only: args = args[:len(self.key_names)] - assert len(args) <= len(key_value_columns) + 1 + assert len(args) <= len(key_value_columns) + 3 assert all({ key not in key_value_columns[:len(args)] for key in kwargs }) f = lambda v, k: str(v) if k in self.key_names else v + kwargs = { k: f(v, k) for k, v in kwargs.items() if (not keys_only or k in self.key_names) } + args_key_names = [ kn for kn in self.key_names if kn not in kwargs ] + kwargs |= { k: f(v, k) for k, v in zip(args_key_names, args[:len(args_key_names)]) } + args_values = args[len(args_key_names):] + if len(args_values) > 0 and isinstance(args_values[0], Score): + assert "score" not in kwargs + kwargs["score"] = args_values[0] + elif len(args_values) > 0: + assert "score" not in kwargs + if len(args_values) == 1: + args_values = (args_values[0], 0, 0) + assert len(args_values) == 3, args + kwargs["score"] = Score(*args_values) if "score" in kwargs: kwargs["value"] = kwargs["score"].value kwargs["left_unc"] = kwargs["score"].left_unc kwargs["right_unc"] = kwargs["score"].right_unc del kwargs["score"] - kwargs = { k: f(v, k) for k, v in kwargs.items() if (not keys_only or k in self.key_names) } if not self.value_names and len(args) > len(self.key_names): assert len(args) == len(self.key_names) + 1 return kwargs | args[-1].to_dict() @@ -205,7 +218,7 @@ def __add__(self, other: Union[Score, "MultiScore"]) -> "MultiScore": assert self.key_names == other.key_names keys = set(self["criterion"]) & set(other["criterion"]) return MultiScore( - data=[ (*tuple(key), *(self[key] + other[key]).to_triplet() for key in keys ], + data=[ (*tuple(key), *(self[key] + other[key]).to_triplet()) for key in keys ], key_names=self.key_names ) @@ -218,7 +231,7 @@ def __sub__(self, other: Union[Score, "MultiScore"]) -> "MultiScore": assert self.key_names == other.key_names keys = set(self["criterion"]) & set(other["criterion"]) return MultiScore( - data=[ (*tuple(key), *(self[key] - other[key]).to_triplet() for key in keys ], + data=[ (*tuple(key), *(self[key] - other[key]).to_triplet()) for key in keys ], key_names=self.key_names ) @@ -231,7 +244,7 @@ def __mul__(self, other: Union[Score, "MultiScore"]) -> "MultiScore": assert self.key_names == other.key_names keys = set(self["criterion"]) & set(other["criterion"]) return MultiScore( - data=[ (*tuple(key), *(self[key] * other[key]).to_triplet() for key in keys ], + data=[ (*tuple(key), *(self[key] * other[key]).to_triplet()) for key in keys ], key_names=self.key_names ) @@ -244,6 +257,6 @@ def __truediv__(self, other: Union[Score, "MultiScore"]) -> "MultiScore": assert self.key_names == other.key_names keys = set(self["criterion"]) & set(other["criterion"]) return MultiScore( - data=[ (*tuple(key), *(self[key] / other[key]).to_triplet() for key in keys ], + data=[ (*tuple(key), *(self[key] / other[key]).to_triplet()) for key in keys ], key_names=self.key_names ) diff --git a/solidago/src/solidago/state/models/user_models.py b/solidago/src/solidago/state/models/user_models.py index cad6eb77b0..e008de128f 100644 --- a/solidago/src/solidago/state/models/user_models.py +++ b/solidago/src/solidago/state/models/user_models.py @@ -9,13 +9,31 @@ from .direct import DirectScoring -class UserModels(dict): - def __init__(self, *args, default_model_cls: type=DirectScoring, **kwargs): +class UserModels: + def __init__(self, + dataframes: Optional[dict[str, DataFrame]]=None, + default_model_cls: type=DirectScoring, + *args, + **kwargs + ): """ Maps usernames to ScoringModel objects. Useful to export/import `glued` directs / scalings dataframes. """ - super().__init__(*args, **kwargs) + self.dfs = dict() if dataframes is None else dataframes + for name, df in self.dfs.items(): + if isinstance(df, (str, Path)): + df_filename = df + try: df = pd.read_csv(df_filename, keep_default_na=False) + except pd.errors.EmptyDataError: df = DataFrame() + self.dfs[name] = df + if name == "directs": + self.dfs[name] = MultiScore(df, key_names=["username", "entity_name", "criterion"]) + elif name == "multiplicators" and isinstance(df, DataFrame): + self.dfs[name] = MultiScore(df, key_names=["username", "depth", "criterion"]) + elif name == "translations" and isinstance(df, DataFrame): + self.dfs[name] = MultiScore(df, key_names=["username", "depth", "criterion"]) self.default_model_cls = default_model_cls - + self._groups = None + def default_value(self) -> ScoringModel: return self.default_model_cls()