From 252bd7d939b033481fb1c4245898b2dfcb0e2d6f Mon Sep 17 00:00:00 2001 From: Mauko Quiroga Date: Fri, 11 Oct 2024 18:18:46 +0200 Subject: [PATCH] feat: add core population (#1238 #1276 #1277) --- openfisca_core/holders/__init__.py | 18 +- openfisca_core/holders/holder.py | 3 +- openfisca_core/holders/types.py | 3 + openfisca_core/populations/__init__.py | 6 + .../populations/_core_population.py | 241 ++++++++++++++++++ openfisca_core/populations/_errors.py | 15 ++ .../populations/group_population.py | 7 +- openfisca_core/populations/population.py | 170 +----------- openfisca_core/populations/types.py | 72 ++++++ openfisca_core/types.py | 4 +- 10 files changed, 366 insertions(+), 173 deletions(-) create mode 100644 openfisca_core/holders/types.py create mode 100644 openfisca_core/populations/_core_population.py create mode 100644 openfisca_core/populations/_errors.py create mode 100644 openfisca_core/populations/types.py diff --git a/openfisca_core/holders/__init__.py b/openfisca_core/holders/__init__.py index c8422af7d..127d7fa92 100644 --- a/openfisca_core/holders/__init__.py +++ b/openfisca_core/holders/__init__.py @@ -21,9 +21,15 @@ # # See: https://www.python.org/dev/peps/pep-0008/#imports -from .helpers import ( # noqa: F401 - set_input_dispatch_by_period, - set_input_divide_by_period, -) -from .holder import Holder # noqa: F401 -from .memory_usage import MemoryUsage # noqa: F401 +from . import types +from .helpers import set_input_dispatch_by_period, set_input_divide_by_period +from .holder import Holder +from .memory_usage import MemoryUsage + +__all__ = [ + "Holder", + "MemoryUsage", + "set_input_dispatch_by_period", + "set_input_divide_by_period", + "types", +] diff --git a/openfisca_core/holders/holder.py b/openfisca_core/holders/holder.py index 7183d4a44..e79ef9ad2 100644 --- a/openfisca_core/holders/holder.py +++ b/openfisca_core/holders/holder.py @@ -18,6 +18,7 @@ types, ) +from . import types as t from .memory_usage import MemoryUsage @@ -45,7 +46,7 @@ def __init__(self, variable, population) -> None: if self.variable.name in self.simulation.memory_config.variables_to_drop: self._do_not_store = True - def clone(self, population): + def clone(self, population: t.CorePopulation) -> t.Holder: """Copy the holder just enough to be able to run a new simulation without modifying the original simulation.""" new = commons.empty_clone(self) new_dict = new.__dict__ diff --git a/openfisca_core/holders/types.py b/openfisca_core/holders/types.py new file mode 100644 index 000000000..355fe71e4 --- /dev/null +++ b/openfisca_core/holders/types.py @@ -0,0 +1,3 @@ +from openfisca_core.types import CorePopulation, Holder + +__all__ = ["CorePopulation", "Holder"] diff --git a/openfisca_core/populations/__init__.py b/openfisca_core/populations/__init__.py index 0047c528b..172706048 100644 --- a/openfisca_core/populations/__init__.py +++ b/openfisca_core/populations/__init__.py @@ -29,6 +29,9 @@ ) from openfisca_core.projectors.helpers import get_projector_from_shortcut, projectable +from . import types +from ._core_population import CorePopulation +from ._errors import InvalidArraySizeError from .config import ADD, DIVIDE from .group_population import GroupPopulation from .population import Population @@ -36,12 +39,15 @@ __all__ = [ "ADD", "DIVIDE", + "CorePopulation", "EntityToPersonProjector", "FirstPersonToEntityProjector", "GroupPopulation", + "InvalidArraySizeError", "Population", "Projector", "UniqueRoleToEntityProjector", "get_projector_from_shortcut", "projectable", + "types", ] diff --git a/openfisca_core/populations/_core_population.py b/openfisca_core/populations/_core_population.py new file mode 100644 index 000000000..908fd3654 --- /dev/null +++ b/openfisca_core/populations/_core_population.py @@ -0,0 +1,241 @@ +from __future__ import annotations + +from collections.abc import Sequence +from typing import NamedTuple, TypeVar +from typing_extensions import TypedDict + +import enum +import traceback + +import numpy +import strenum + +from openfisca_core import holders, periods + +from . import types as t +from ._errors import InvalidArraySizeError + +#: Type variable for a covariant data type. +_DT_co = TypeVar("_DT_co", covariant=True, bound=t.VarDType) + + +class Option(strenum.StrEnum): + ADD = enum.auto() + DIVIDE = enum.auto() + + +class Calculate(NamedTuple): + variable: str + period: t.Period + option: Sequence[str] | None + + +class MemoryUsageByVariable(TypedDict, total=False): + by_variable: dict[str, holders.MemoryUsage] + total_nb_bytes: int + + +class CorePopulation: + """Base class to build populations from. + + Args: + entity: The :class:`.CoreEntity` of the population. + *__args: Variable length argument list. + **__kwds: Arbitrary keyword arguments. + + """ + + #: ??? + count: int = 0 + + #: The :class:`.CoreEntity` of the population. + entity: t.CoreEntity + + #: ??? + ids: Sequence[str] = [] + + #: ??? + simulation: None | t.Simulation = None + + def __init__(self, entity: t.CoreEntity, *__args: object, **__kwds: object) -> None: + self.entity = entity + self._holders: t.Holders = {} + + def __call__( + self, + variable_name: t.VariableName, + period: None | t.PeriodLike = None, + options: None | Sequence[str] = None, + ) -> None | t.FloatArray: + """Calculate ``variable_name`` for ``period``, using the formula if it exists. + + Example: + >>> person("salary", "2017-04") + >>> array([300.0]) + + Returns: + None: If there is no :class:`.Simulation`. + ndarray[float32]: The result of the calculation. + + """ + if self.simulation is None: + return None + + calculate: Calculate = Calculate( + variable=variable_name, + period=periods.period(period), + option=options, + ) + + self.entity.check_variable_defined_for_entity(calculate.variable) + self.check_period_validity(calculate.variable, calculate.period) + + if not isinstance(calculate.option, Sequence): + return self.simulation.calculate( + calculate.variable, + calculate.period, + ) + + if Option.ADD in map(str.upper, calculate.option): + return self.simulation.calculate_add( + calculate.variable, + calculate.period, + ) + + if Option.DIVIDE in map(str.upper, calculate.option): + return self.simulation.calculate_divide( + calculate.variable, + calculate.period, + ) + + raise ValueError( + f"Options config.ADD and config.DIVIDE are incompatible (trying to compute variable {variable_name})".encode(), + ) + + def empty_array(self) -> t.FloatArray: + """Return an empty array. + + Returns: + ndarray[float32]: An empty array. + + Examples: + >>> import numpy + + >>> from openfisca_core import populations as p + + >>> class Population(p.CorePopulation): ... + + >>> population = Population(None) + >>> population.empty_array() + array([], dtype=float32) + + >>> population.count = 3 + >>> population.empty_array() + array([0., 0., 0.], dtype=float32) + + """ + return numpy.zeros(self.count, dtype=t.FloatDType) + + def filled_array( + self, value: _DT_co, dtype: None | t.DTypeLike = None + ) -> t.Array[_DT_co]: + """Return an array filled with a value. + + Args: + value: The value to fill the array with. + dtype: The data type of the array. + + Returns: + ndarray[generic]: An array filled with the value. + + Examples: + >>> import numpy + + >>> from openfisca_core import populations as p + + >>> class Population(p.CorePopulation): ... + + >>> population = Population(None) + >>> population.count = 3 + >>> population.filled_array(1) + array([1, 1, 1]) + + >>> population.filled_array(numpy.float32(1)) + array([1., 1., 1.], dtype=float32) + + >>> population.filled_array(1, dtype=str) + array(['1', '1', '1'], dtype='>> population.filled_array("hola", dtype=numpy.uint8) + Traceback (most recent call last): + ValueError: could not convert string to float: 'hola' + + """ + return numpy.full(self.count, value, dtype) + + def get_index(self, id: str) -> int: + return self.ids.index(id) + + # Calculations + + def check_array_compatible_with_entity(self, array: t.FloatArray) -> None: + if self.count == array.size: + return + raise InvalidArraySizeError(array, self.entity.key, self.count) + + def check_period_validity( + self, + variable_name: str, + period: int | str | Period | None, + ) -> None: + if isinstance(period, (int, str, periods.Period)): + return + + stack = traceback.extract_stack() + filename, line_number, function_name, line_of_code = stack[-3] + msg = f""" +You requested computation of variable "{variable_name}", but you did not specify on which period in "{filename}:{line_number}": + {line_of_code} +When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example: + computed_salary = person('salary', period). +See more information at . +""" + raise ValueError( + msg, + ) + + # Helpers + + def get_holder(self, variable_name: t.VariableName) -> t.Holder: + self.entity.check_variable_defined_for_entity(variable_name) + holder = self._holders.get(variable_name) + if holder: + return holder + variable = self.entity.get_variable(variable_name) + self._holders[variable_name] = holder = holders.Holder(variable, self) + return holder + + def get_memory_usage( + self, + variables: Sequence[str] | None = None, + ) -> MemoryUsageByVariable: + holders_memory_usage = { + variable_name: holder.get_memory_usage() + for variable_name, holder in self._holders.items() + if variables is None or variable_name in variables + } + + total_memory_usage = sum( + holder_memory_usage["total_nb_bytes"] + for holder_memory_usage in holders_memory_usage.values() + ) + + return MemoryUsageByVariable( + { + "total_nb_bytes": total_memory_usage, + "by_variable": holders_memory_usage, + }, + ) + + +__all__ = ["CorePopulation"] diff --git a/openfisca_core/populations/_errors.py b/openfisca_core/populations/_errors.py new file mode 100644 index 000000000..a664ad71f --- /dev/null +++ b/openfisca_core/populations/_errors.py @@ -0,0 +1,15 @@ +from . import types as t + + +class InvalidArraySizeError(ValueError): + """Raised when an array has an invalid size.""" + + def __init__(self, array: t.FloatArray, entity: t.EntityKey, count: int) -> None: + msg = ( + f"Input {array} is not a valid value for the entity {entity} " + f"(size = {array.size} != {count} = count)." + ) + super().__init__(msg) + + +__all__ = ["InvalidArraySizeError"] diff --git a/openfisca_core/populations/group_population.py b/openfisca_core/populations/group_population.py index 3b3384238..120dc9c65 100644 --- a/openfisca_core/populations/group_population.py +++ b/openfisca_core/populations/group_population.py @@ -1,14 +1,17 @@ +from __future__ import annotations + import typing import numpy from openfisca_core import entities, indexed_enums, projectors +from . import types as t from .population import Population class GroupPopulation(Population): - def __init__(self, entity, members) -> None: + def __init__(self, entity: t.GroupEntity, members: t.Members) -> None: super().__init__(entity) self.members = members self._members_entity_id = None @@ -78,6 +81,8 @@ def ordered_members_map(self): self._ordered_members_map = numpy.argsort(self.members_entity_id) return self._ordered_members_map + # Helpers + def get_role(self, role_name): return next( (role for role in self.entity.flattened_roles if role.key == role_name), diff --git a/openfisca_core/populations/population.py b/openfisca_core/populations/population.py index 06acc05d2..0f5275cfa 100644 --- a/openfisca_core/populations/population.py +++ b/openfisca_core/populations/population.py @@ -1,35 +1,18 @@ from __future__ import annotations -from collections.abc import Sequence -from typing import NamedTuple -from typing_extensions import TypedDict - -from openfisca_core.types import Array, Period, Role, Simulation, SingleEntity - -import traceback - import numpy -from openfisca_core import holders, periods, projectors - -from . import config +from openfisca_core import projectors +from . import types as t +from ._core_population import CorePopulation -class Population: - simulation: Simulation | None - entity: SingleEntity - _holders: dict[str, holders.Holder] - count: int - ids: Array[str] - def __init__(self, entity: SingleEntity) -> None: - self.simulation = None - self.entity = entity - self._holders = {} - self.count = 0 - self.ids = [] +class Population(CorePopulation): + def __init__(self, entity: t.SingleEntity) -> None: + super().__init__(entity) - def clone(self, simulation: Simulation) -> Population: + def clone(self, simulation: Simulation) -> t.CorePopulation: result = Population(self.entity) result.simulation = simulation result._holders = { @@ -40,16 +23,6 @@ def clone(self, simulation: Simulation) -> Population: result.ids = self.ids return result - def empty_array(self) -> Array[float]: - return numpy.zeros(self.count) - - def filled_array( - self, - value: float | bool, - dtype: numpy.dtype | None = None, - ) -> Array[float] | Array[bool]: - return numpy.full(self.count, value, dtype) - def __getattr__(self, attribute: str) -> projectors.Projector: projector: projectors.Projector | None projector = projectors.get_projector_from_shortcut(self, attribute) @@ -62,126 +35,8 @@ def __getattr__(self, attribute: str) -> projectors.Projector: msg, ) - def get_index(self, id: str) -> int: - return self.ids.index(id) - - # Calculations - - def check_array_compatible_with_entity( - self, - array: Array[float], - ) -> None: - if self.count == array.size: - return - - msg = f"Input {array} is not a valid value for the entity {self.entity.key} (size = {array.size} != {self.count} = count)" - raise ValueError( - msg, - ) - - def check_period_validity( - self, - variable_name: str, - period: int | str | Period | None, - ) -> None: - if isinstance(period, (int, str, periods.Period)): - return - - stack = traceback.extract_stack() - filename, line_number, function_name, line_of_code = stack[-3] - msg = f""" -You requested computation of variable "{variable_name}", but you did not specify on which period in "{filename}:{line_number}": - {line_of_code} -When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example: - computed_salary = person('salary', period). -See more information at . -""" - raise ValueError( - msg, - ) - - def __call__( - self, - variable_name: str, - period: int | str | Period | None = None, - options: Sequence[str] | None = None, - ) -> Array[float] | None: - """Calculate the variable ``variable_name`` for the entity and the period ``period``, using the variable formula if it exists. - - Example: - >>> person("salary", "2017-04") - >>> array([300.0]) - - :returns: A numpy array containing the result of the calculation - - """ - if self.simulation is None: - return None - - calculate: Calculate = Calculate( - variable=variable_name, - period=periods.period(period), - option=options, - ) - - self.entity.check_variable_defined_for_entity(calculate.variable) - self.check_period_validity(calculate.variable, calculate.period) - - if not isinstance(calculate.option, Sequence): - return self.simulation.calculate( - calculate.variable, - calculate.period, - ) - - if config.ADD in calculate.option: - return self.simulation.calculate_add( - calculate.variable, - calculate.period, - ) - - if config.DIVIDE in calculate.option: - return self.simulation.calculate_divide( - calculate.variable, - calculate.period, - ) - - raise ValueError( - f"Options config.ADD and config.DIVIDE are incompatible (trying to compute variable {variable_name})".encode(), - ) - # Helpers - def get_holder(self, variable_name: str) -> holders.Holder: - self.entity.check_variable_defined_for_entity(variable_name) - holder = self._holders.get(variable_name) - if holder: - return holder - variable = self.entity.get_variable(variable_name) - self._holders[variable_name] = holder = holders.Holder(variable, self) - return holder - - def get_memory_usage( - self, - variables: Sequence[str] | None = None, - ) -> MemoryUsageByVariable: - holders_memory_usage = { - variable_name: holder.get_memory_usage() - for variable_name, holder in self._holders.items() - if variables is None or variable_name in variables - } - - total_memory_usage = sum( - holder_memory_usage["total_nb_bytes"] - for holder_memory_usage in holders_memory_usage.values() - ) - - return MemoryUsageByVariable( - { - "total_nb_bytes": total_memory_usage, - "by_variable": holders_memory_usage, - }, - ) - @projectors.projectable def has_role(self, role: Role) -> Array[bool] | None: """Check if a person has a given role within its `GroupEntity`. @@ -285,14 +140,3 @@ def get_rank( # Return -1 for the persons who don't respect the condition return numpy.where(condition, result, -1) - - -class Calculate(NamedTuple): - variable: str - period: Period - option: Sequence[str] | None - - -class MemoryUsageByVariable(TypedDict, total=False): - by_variable: dict[str, holders.MemoryUsage] - total_nb_bytes: int diff --git a/openfisca_core/populations/types.py b/openfisca_core/populations/types.py new file mode 100644 index 000000000..c0056497d --- /dev/null +++ b/openfisca_core/populations/types.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from collections.abc import Iterable, MutableMapping +from typing import Union +from typing_extensions import NewType, TypeAlias + +from openfisca_core.types import ( + Array, + CoreEntity, + CorePopulation, + DTypeLike, + EntityKey, + GroupEntity, + Holder, + Period, + PeriodStr, + Simulation, + SingleEntity, + SinglePopulation, + VariableName, +) + +from numpy import ( + bool_ as BoolDType, + float32 as FloatDType, + generic as VarDType, + str_ as StrDType, +) + +# Commons + +#: Type alias for an array of strings. +StrArray: TypeAlias = Array[StrDType] + +#: Type alias for an array of booleans. +BoolArray: TypeAlias = Array[BoolDType] + +#: Type alias for an array of floats. +FloatArray: TypeAlias = Array[FloatDType] + +# Periods + +#: New type for a period integer. +PeriodInt = NewType("PeriodInt", int) + +#: Type alias for a period-like object. +PeriodLike: TypeAlias = Union[Period, PeriodStr, PeriodInt] + +# Populations + +#: Type alias for a population's holders. +Holders: TypeAlias = MutableMapping[VariableName, Holder] + +# TODO(Mauko Quiroga-Alvarado): I'm not sure if this type alias is correct. +# https://openfisca.org/doc/coding-the-legislation/50_entities.html +Members: TypeAlias = Iterable[SinglePopulation] + + +__all__ = [ + "CoreEntity", + "CorePopulation", + "DTypeLike", + "EntityKey", + "GroupEntity", + "Holder", + "Period", + "Simulation", + "SingleEntity", + "SinglePopulation", + "VarDType", + "VariableName", +] diff --git a/openfisca_core/types.py b/openfisca_core/types.py index b79504c72..04e2a7abb 100644 --- a/openfisca_core/types.py +++ b/openfisca_core/types.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Iterable, Sequence, Sized +from collections.abc import Iterable, Mapping, Sequence, Sized from numpy.typing import DTypeLike, NDArray from typing import NewType, TypeVar, Union from typing_extensions import Protocol, Self, TypeAlias @@ -133,7 +133,7 @@ def __new__( class Holder(Protocol): def clone(self, population: CorePopulation, /) -> Holder: ... - def get_memory_usage(self, /) -> dict[str, object]: ... + def get_memory_usage(self, /) -> Mapping[str, object]: ... # Parameters