ENH: support elimination of groups of products for second choice data
jeffgortmaker committed Feb 25, 2023
1 parent bee963a commit 15010d1
Showing 11 changed files with 255 additions and 94 deletions.
4 changes: 3 additions & 1 deletion docs/background.rst
@@ -295,7 +295,9 @@ Micro moment parts based on second choice are averages over values :math:`v_{pijkt}`

Its simulated analogue is

.. math:: v_p = \frac{\sum_{t \in T} \sum_{i \in I_t} \sum_{j, k \in J_t \cup \{0\}} w_{it} s_{ijt} s_{ik(-j)t} w_{d_pijkt} v_{pijkt}}{\sum_{t \in T} \sum_{i \in I_t} \sum_{j, k \in J_t \cup \{0\}} w_{it} s_{ijt} s_{ik(-j)t} w_{d_pijkt}}.
.. math:: v_p = \frac{\sum_{t \in T} \sum_{i \in I_t} \sum_{j, k \in J_t \cup \{0\}} w_{it} s_{ijt} s_{ik(-j)t} w_{d_pijkt} v_{pijkt}}{\sum_{t \in T} \sum_{i \in I_t} \sum_{j, k \in J_t \cup \{0\}} w_{it} s_{ijt} s_{ik(-j)t} w_{d_pijkt}},

in which :math:`s_{ik(-j)t}` is the probability of choosing :math:`k` when :math:`j` is removed from the choice set. One can also define micro moment parts based on second choices where a group of products :math:`h(j)` containing the first choice :math:`j` is removed from the choice set. In this case, the above second choice probabilities become :math:`s_{ik(-h(j))t}`.

Covariances are defined analogously.
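
As a concrete illustration (not part of the commit), here is a minimal NumPy sketch of second-choice probabilities :math:`s_{ik(-h(j))t}` under simple logit for a single market, where the whole group containing the first choice is removed from the choice set; all names are illustrative.

import numpy as np

def second_choice_probabilities(utilities, j, group_ids):
    """Logit second-choice probabilities after removing the group containing
    the first choice j; the outside good's utility is normalized to zero.
    """
    exp_utilities = np.exp(utilities)
    # zero out every product in the same group as the first choice
    exp_utilities[group_ids == group_ids[j]] = 0
    # re-normalize over the outside good and the remaining products
    return exp_utilities / (1 + exp_utilities.sum())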

10 changes: 10 additions & 0 deletions pyblp/economies/economy.py
@@ -244,6 +244,16 @@ def _validate_name(self, name: Optional[str], none_valid: bool = True) -> None:
if name not in names:
raise NameError(f"'{name}' is not None or one of the underlying variables, {list(sorted(names))}.")

def _validate_product_ids_index(self, product_ids_index: int) -> None:
"""Validate that a product IDs index is valid."""
if not isinstance(product_ids_index, int) or product_ids_index < 0:
raise ValueError("The product IDs index must be a non-negative int.")
if self.products.product_ids.size == 0:
raise ValueError("Since the product IDs index is not None, product_data must have product_ids.")
max_index = self.products.product_ids.shape[1] - 1
if not 0 <= product_ids_index <= max_index:
raise ValueError(f"The product IDs index should be at most {max_index}.")

def _coerce_optional_firm_ids(self, firm_ids: Optional[Any], market_ids: Optional[Array] = None) -> Array:
"""Coerce optional array-like firm IDs into a column vector and validate it. By default, assume that firm IDs
are for all markets.
3 changes: 2 additions & 1 deletion pyblp/economies/problem.py
@@ -1279,7 +1279,8 @@ class Problem(ProblemEconomy):
It may be convenient to define IDs for different products:
- **product_ids** (`object, optional`) - IDs that identify products within markets.
- **product_ids** (`object, optional`) - IDs that identify products within markets. There can be multiple
columns.
Finally, clustering groups can be specified to account for within-group correlation while updating the weighting
matrix and estimating standard errors:
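A hedged sketch (not from the commit) of how a multi-column ``product_ids`` field might be built, with invented values; the second column groups products, as in the updated test fixtures below.

import numpy as np

product_ids = np.arange(6)
product_data = {
    'market_ids': np.zeros(6, dtype=int),
    'firm_ids': np.array([0, 0, 1, 1, 2, 2]),
    # first column: unique product IDs; second column: product groups
    'product_ids': np.c_[product_ids, product_ids % 2],
    # ... prices, shares, characteristics, and instruments as usual ...
}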
3 changes: 2 additions & 1 deletion pyblp/economies/simulation.py
@@ -106,7 +106,8 @@ class Simulation(Economy):
It may be convenient to define IDs for different products:
- **product_ids** (`object, optional`) - IDs that identify products within markets.
- **product_ids** (`object, optional`) - IDs that identify products within markets. There can be multiple
columns.
To simulate a nested logit or random coefficients nested logit (RCNL) model, nesting groups must be specified:
6 changes: 3 additions & 3 deletions pyblp/markets/economy_results_market.py
@@ -353,8 +353,8 @@ def safely_compute_profits(

@NumericalErrorHandler(exceptions.PostEstimationNumericalError)
def safely_compute_consumer_surplus(
self, keep_all: bool, eliminate_product_ids: Optional[Any], prices: Optional[Array]) -> (
Tuple[Array, List[Error]]):
self, keep_all: bool, eliminate_product_ids: Optional[Any], product_ids_index: int,
prices: Optional[Array]) -> Tuple[Array, List[Error]]:
"""Estimate population-normalized consumer surplus or keep all individual-level surpluses. By default, use
unchanged prices, handling any numerical errors.
"""
@@ -379,7 +379,7 @@ def safely_compute_consumer_surplus(

# eliminate any products from the choice set
if eliminate_product_ids is not None:
for j, product_id in enumerate(self.products.product_ids):
for j, product_id in enumerate(self.products.product_ids[:, product_ids_index]):
if product_id in eliminate_product_ids:
exp_utilities[j] = 0

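A standalone sketch of the elimination step above, with invented values for one market; under logit, consumer surplus is the log-sum of the surviving exponentiated utilities (up to division by the price coefficient).

import numpy as np

utilities = np.array([1.0, 0.5, -0.2, 0.3])  # illustrative mean utilities
product_ids = np.c_[np.arange(4), np.arange(4) % 2]  # two ID columns
eliminate_product_ids = {1, 3}
product_ids_index = 0

exp_utilities = np.exp(utilities)
# zero out products whose ID in the selected column is eliminated
mask = np.isin(product_ids[:, product_ids_index], list(eliminate_product_ids))
exp_utilities[mask] = 0
surplus = np.log1p(exp_utilities.sum())  # log(1 + sum of remaining terms)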
185 changes: 125 additions & 60 deletions pyblp/markets/market.py

Large diffs are not rendered by default.

20 changes: 19 additions & 1 deletion pyblp/micro.py
@@ -80,6 +80,12 @@ class MicroDataset(StringRepresentation):
issue, consider setting ``pyblp.options.micro_computation_chunks`` to a value higher than its default of
``1``, such as the highest :math:`J_t`. This will cut down on memory usage without much affecting speed.
eliminated_product_ids_index : `int, optional`
This option determines whether the dataset's second choices are computed after eliminating only the first choice
product :math:`j` from the choice set, in which case this should be ``None``, the default, or after eliminating a
group of products that includes the first choice, in which case this should be a number between ``0`` and one less
than the number of columns in the ``product_ids`` field of ``product_data``, inclusive. The corresponding column of
``product_ids`` determines the groups.
market_ids : `array-like, optional`
Distinct market IDs with nonzero survey weights :math:`w_{dijt}`. For other markets, :math:`w_{dijt} = 0`, and
``compute_weights`` will not be called.
@@ -94,9 +100,11 @@ class MicroDataset(StringRepresentation):
observations: int
compute_weights: functools.partial
market_ids: Optional[Set]
eliminated_product_ids_index: Optional[int]

def __init__(
self, name: str, observations: int, compute_weights: Callable,
eliminated_product_ids_index: Optional[int] = None,
market_ids: Optional[Union[Sequence, Array]] = None) -> None:
"""Validate information to the greatest extent possible without an economy or calling the function."""
if not isinstance(name, str):
@@ -110,6 +118,12 @@ def __init__(
self.observations = observations
self.compute_weights = functools.partial(compute_weights)

# validate the product IDs index
self.eliminated_product_ids_index = eliminated_product_ids_index
if eliminated_product_ids_index is not None:
if not isinstance(eliminated_product_ids_index, int) or eliminated_product_ids_index < 0:
raise ValueError("eliminated_product_ids_index must be None or a non-negative int.")

# validate market IDs, checking for duplicates
if market_ids is None:
self.market_ids = None
@@ -128,7 +142,11 @@ def __str__(self) -> str:
return f"{self.name}: {self.observations} Observations in {self._format_markets(text=True)}"

def _validate(self, economy: 'Economy') -> None:
"""Check that all market IDs associated with this dataset are in the economy."""
"""Check that all market IDs associated with this dataset are in the economy and that any eliminated product
IDs index is valid.
"""
if self.eliminated_product_ids_index is not None:
economy._validate_product_ids_index(self.eliminated_product_ids_index)
if self.market_ids is not None:
extra_ids = self.market_ids - set(economy.unique_market_ids)
if extra_ids:
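A usage sketch mirroring the new test fixtures: second choices for this dataset are computed after eliminating the entire group (column ``1`` of ``product_ids``) containing the first choice, rather than only the first choice itself; the observation count is illustrative.

import numpy as np
from pyblp import MicroDataset

dataset = MicroDataset(
    name="diversion from 1, grouped",
    observations=1000,  # illustrative
    compute_weights=lambda t, p, a: np.tile(
        p.product_ids[:, [0]] == 1, (a.size, 1, 1 + p.size)
    ),
    eliminated_product_ids_index=1,
)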
2 changes: 0 additions & 2 deletions pyblp/primitives.py
@@ -168,8 +168,6 @@ def __new__(
raise ValueError("The firm_ids field of product_data must be one-dimensional.")
if nesting_ids is not None and nesting_ids.shape[1] > 1:
raise ValueError("The nesting_ids field of product_data must be one-dimensional.")
if product_ids is not None and product_ids.shape[1] > 1:
raise ValueError("The product_ids field of product_data must be one-dimensional.")
if clustering_ids is not None:
if clustering_ids.shape[1] > 1:
raise ValueError("The clustering_ids field of product_data must be one-dimensional.")
10 changes: 8 additions & 2 deletions pyblp/results/economy_results.py
@@ -968,7 +968,7 @@ def compute_profit_hessians(

def compute_consumer_surpluses(
self, prices: Optional[Any] = None, keep_all: bool = False, eliminate_product_ids: Optional[Any] = None,
market_id: Optional[Any] = None) -> Array:
product_ids_index: int = 0, market_id: Optional[Any] = None) -> Array:
r"""Estimate population-normalized consumer surpluses, :math:`\text{CS}`.
Assuming away nonlinear income effects, the surplus in market :math:`t` is
@@ -1017,6 +1017,10 @@ def compute_consumer_surpluses(
IDs of the products to eliminate from the choice set. These IDs should show up in the ``product_ids`` field
of ``product_data`` in :class:`Problem`. Eliminating one or more products and comparing consumer surpluses
gives a measure of willingness to pay for these products.
product_ids_index : `int, optional`
Index between ``0`` and the number of columns in the ``product_ids`` field of ``product_data`` minus one,
inclusive, which determines which column of product IDs ``eliminate_product_ids`` refers to. By default, it
refers to the first column, which is index ``0``.
market_id : `object, optional`
ID of the market in which to compute consumer surplus. By default, consumer surpluses are computed in all
markets and stacked.
@@ -1036,11 +1040,13 @@
"""
output("Computing consumer surpluses with the equation that assumes away nonlinear income effects ...")
if eliminate_product_ids is not None:
self._economy._validate_product_ids_index(product_ids_index)
market_ids = self._select_market_ids(market_id)
prices = self._coerce_optional_prices(prices, market_ids)
return self._combine_arrays(
EconomyResultsMarket.safely_compute_consumer_surplus, market_ids,
fixed_args=[keep_all, eliminate_product_ids], market_args=[prices]
fixed_args=[keep_all, eliminate_product_ids, product_ids_index], market_args=[prices]
)


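A hedged usage sketch, assuming ``results`` is a solved :class:`ProblemResults`: willingness to pay for the group of products whose second ID column equals ``1`` is the drop in consumer surplus when that group is eliminated.

cs = results.compute_consumer_surpluses()
cs_without_group = results.compute_consumer_surpluses(
    eliminate_product_ids=[1], product_ids_index=1
)
willingness_to_pay = cs - cs_without_group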
100 changes: 79 additions & 21 deletions tests/conftest.py
@@ -302,7 +302,7 @@ def large_blp_simulation() -> SimulationFixture:
product_data={
'market_ids': id_data.market_ids,
'firm_ids': id_data.firm_ids,
'product_ids': product_ids,
'product_ids': np.c_[product_ids, np.mod(product_ids, 2)],
'clustering_ids': np.random.RandomState(2).choice(range(30), id_data.size)
},
beta=[1, 1, 2, 3, 1],
@@ -339,12 +339,19 @@ def large_blp_simulation() -> SimulationFixture:
]
}

inside_diversion_micro_dataset = MicroDataset(
inside_diversion_micro_dataset0 = MicroDataset(
name="diversion from 1",
observations=simulation.N,
compute_weights=lambda _, p, a: np.tile(p.product_ids == 1, (a.size, 1, 1 + p.size)),
compute_weights=lambda _, p, a: np.tile(p.product_ids[:, [0]] == 1, (a.size, 1, 1 + p.size)),
market_ids=simulation.unique_market_ids[6:10],
)
inside_diversion_micro_dataset1 = MicroDataset(
name="diversion from 1, grouped",
observations=simulation.N,
compute_weights=lambda _, p, a: np.tile(p.product_ids[:, [0]] == 1, (a.size, 1, 1 + p.size)),
market_ids=simulation.unique_market_ids[6:10],
eliminated_product_ids_index=1,
)
outside_diversion_micro_dataset = MicroDataset(
name="diversion from outside",
observations=simulation.N,
@@ -362,7 +369,7 @@ def large_blp_simulation() -> SimulationFixture:
dataset=MicroDataset(
name="product 0",
observations=simulation.N,
compute_weights=lambda _, p, a: np.tile(p.product_ids.flat == 0, (a.size, 1)),
compute_weights=lambda _, p, a: np.tile(p.product_ids[:, 0] == 0, (a.size, 1)),
),
compute_values=lambda _, p, a: np.tile(a.demographics[:, [1]], (1, p.size)),
),
@@ -376,7 +383,7 @@ def large_blp_simulation() -> SimulationFixture:
name="product 0 and outside",
observations=simulation.N,
compute_weights=lambda _, p, a: np.c_[
np.ones((a.size, 1)), np.tile(p.product_ids.flat == 0, (a.size, 1))
np.ones((a.size, 1)), np.tile(p.product_ids[:, 0] == 0, (a.size, 1))
],
market_ids=simulation.unique_market_ids[1:4],
),
@@ -390,32 +397,54 @@ def large_blp_simulation() -> SimulationFixture:
value=0,
parts=MicroPart(
name="1 to 0 diversion ratio",
dataset=inside_diversion_micro_dataset,
dataset=inside_diversion_micro_dataset0,
compute_values=lambda _, p, a: np.concatenate(
[np.zeros((a.size, p.size, 1)), np.tile(p.product_ids.flat == 0, (a.size, p.size, 1))], axis=2
[np.zeros((a.size, p.size, 1)), np.tile(p.product_ids[:, 0] == 0, (a.size, p.size, 1))], axis=2
),
),
),
MicroMoment(
name="outside to 0 diversion ratio",
name="1 to outside diversion ratio",
value=0,
parts=MicroPart(
name="outside to 0 diversion ratio",
dataset=outside_diversion_micro_dataset,
compute_values=lambda _, p, a: np.tile(p.product_ids.flat == 0, (a.size, 1 + p.size, 1)),
name="1 to outside diversion ratio",
dataset=inside_diversion_micro_dataset0,
compute_values=lambda _, p, a: np.concatenate(
[np.ones((a.size, p.size, 1)), np.zeros((a.size, p.size, p.size))], axis=2
),
),
),
MicroMoment(
name="1 to outside diversion ratio",
name="1 to 0 diversion ratio, grouped",
value=0,
parts=MicroPart(
name="1 to outside diversion ratio",
dataset=inside_diversion_micro_dataset,
name="1 to 0 diversion ratio, grouped",
dataset=inside_diversion_micro_dataset1,
compute_values=lambda _, p, a: np.concatenate(
[np.zeros((a.size, p.size, 1)), np.tile(p.product_ids[:, 0] == 0, (a.size, p.size, 1))], axis=2
),
),
),
MicroMoment(
name="1 to outside diversion ratio, grouped",
value=0,
parts=MicroPart(
name="1 to outside diversion ratio, grouped",
dataset=inside_diversion_micro_dataset1,
compute_values=lambda _, p, a: np.concatenate(
[np.ones((a.size, p.size, 1)), np.zeros((a.size, p.size, p.size))], axis=2
),
),
),
MicroMoment(
name="outside to 0 diversion ratio",
value=0,
parts=MicroPart(
name="outside to 0 diversion ratio",
dataset=outside_diversion_micro_dataset,
compute_values=lambda _, p, a: np.tile(p.product_ids[:, 0] == 0, (a.size, 1 + p.size, 1)),
),
),
MicroMoment(
name="unconditional diversion interaction",
value=0,
@@ -503,7 +532,7 @@ def large_nested_blp_simulation() -> SimulationFixture:
product_data={
'market_ids': id_data.market_ids,
'firm_ids': id_data.firm_ids,
'product_ids': product_ids,
'product_ids': np.c_[product_ids, np.mod(product_ids, 2)],
'nesting_ids': state.choice(['f', 'g', 'h'], id_data.size),
'clustering_ids': state.choice(range(30), id_data.size),
},
@@ -546,11 +575,18 @@ def large_nested_blp_simulation() -> SimulationFixture:
dataset=inside_micro_dataset,
compute_values=lambda _, p, a: np.tile((a.agent_ids == 2) | (a.agent_ids == 3), (1, p.size)),
)
inside_diversion_micro_dataset = MicroDataset(
inside_diversion_micro_dataset0 = MicroDataset(
name="diversion from 1",
observations=simulation.N,
compute_weights=lambda _, p, a: np.tile(p.product_ids == 1, (a.size, 1, 1 + p.size)),
compute_weights=lambda _, p, a: np.tile(p.product_ids[:, [0]] == 1, (a.size, 1, 1 + p.size)),
market_ids=[simulation.unique_market_ids[2]],
)
inside_diversion_micro_dataset1 = MicroDataset(
name="diversion from 1, grouped",
observations=simulation.N,
compute_weights=lambda _, p, a: np.tile(p.product_ids[:, [0]] == 1, (a.size, 1, 1 + p.size)),
market_ids=[simulation.unique_market_ids[2]],
eliminated_product_ids_index=1,
)
outside_diversion_micro_dataset = MicroDataset(
name="diversion from outside",
@@ -616,9 +652,9 @@ def large_nested_blp_simulation() -> SimulationFixture:
value=0,
parts=MicroPart(
name="1 to 0 diversion ratio",
dataset=inside_diversion_micro_dataset,
dataset=inside_diversion_micro_dataset0,
compute_values=lambda _, p, a: np.concatenate(
[np.zeros((a.size, p.size, 1)), np.tile(p.product_ids.flat == 0, (a.size, p.size, 1))], axis=2
[np.zeros((a.size, p.size, 1)), np.tile(p.product_ids[:, 0] == 0, (a.size, p.size, 1))], axis=2
),
),
),
@@ -627,7 +663,29 @@ def large_nested_blp_simulation() -> SimulationFixture:
value=0,
parts=MicroPart(
name="1 to outside diversion ratio",
dataset=inside_diversion_micro_dataset,
dataset=inside_diversion_micro_dataset0,
compute_values=lambda _, p, a: np.concatenate(
[np.ones((a.size, p.size, 1)), np.zeros((a.size, p.size, p.size))], axis=2
),
),
),
MicroMoment(
name="1 to 0 diversion ratio, grouped",
value=0,
parts=MicroPart(
name="1 to 0 diversion ratio, grouped",
dataset=inside_diversion_micro_dataset1,
compute_values=lambda _, p, a: np.concatenate(
[np.zeros((a.size, p.size, 1)), np.tile(p.product_ids[:, 0] == 0, (a.size, p.size, 1))], axis=2
),
),
),
MicroMoment(
name="1 to outside diversion ratio, grouped",
value=0,
parts=MicroPart(
name="1 to outside diversion ratio, grouped",
dataset=inside_diversion_micro_dataset1,
compute_values=lambda _, p, a: np.concatenate(
[np.ones((a.size, p.size, 1)), np.zeros((a.size, p.size, p.size))], axis=2
),
Expand All @@ -639,7 +697,7 @@ def large_nested_blp_simulation() -> SimulationFixture:
parts=MicroPart(
name="outside to 0 diversion ratio",
dataset=outside_diversion_micro_dataset,
compute_values=lambda _, p, a: np.tile(p.product_ids.flat == 0, (a.size, 1 + p.size, 1)),
compute_values=lambda _, p, a: np.tile(p.product_ids[:, 0] == 0, (a.size, 1 + p.size, 1)),
),
),
])