From 5da9b72a451d8ab765dd5ba31494feddadddf0a2 Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Tue, 26 Mar 2024 17:47:47 +0100
Subject: [PATCH] use income column in prepare persons

---
 matsim/scenariogen/data/formats/srv.py |   2 +-
 matsim/scenariogen/data/preparation.py | 104 +++++++++++++++++++++++--
 2 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/matsim/scenariogen/data/formats/srv.py b/matsim/scenariogen/data/formats/srv.py
index 42dde72..8151049 100644
--- a/matsim/scenariogen/data/formats/srv.py
+++ b/matsim/scenariogen/data/formats/srv.py
@@ -446,7 +446,7 @@ def parse_zone(h, prefix=""):
         ob = parse_int_str(getattr(h, prefix + "OBERBEZIRK"))
 
         zone = pd.NA
-        if ob:
+        if ob and hasattr(h, prefix + "UNTERBEZIRK"):
             zone = ob
             ub = parse_int_str(getattr(h, prefix + "UNTERBEZIRK"))
             if ub:
diff --git a/matsim/scenariogen/data/preparation.py b/matsim/scenariogen/data/preparation.py
index ebb118d..2029a75 100644
--- a/matsim/scenariogen/data/preparation.py
+++ b/matsim/scenariogen/data/preparation.py
@@ -15,6 +15,15 @@ def prepare_persons(hh, pp, tt, augment=5, max_hh_size=5, core_weekday=False, re
     """ Cleans common data errors and fill missing values """
     df = pp.join(hh, on="hh_id", lsuffix="hh_")
 
+    # Replace unknown income group
+    fill(df, "income", -1)
+
+    # Replace unknown economic status
+    df["economic_status"] = df.apply(
+        lambda x: income_to_economic_status(x.income, df[df.hh_id == x.hh_id])
+        if x.economic_status == EconomicStatus.UNKNOWN else x.economic_status, axis=1
+    )
+
     # Augment data using p_weight
     if augment > 1:
         df = augment_persons(df, augment)
@@ -31,9 +40,6 @@ def prepare_persons(hh, pp, tt, augment=5, max_hh_size=5, core_weekday=False, re
     df.loc[df.age < 6, "pt_abo_avail"] = Availability.NO
     fill(df, "pt_abo_avail", Availability.UNKNOWN)
 
-    # Replace unknown income group
-    fill(df, "economic_status", EconomicStatus.UNKNOWN)
-
     # Large households are underrepresented and capped
     df.n_persons = np.minimum(df.n_persons, max_hh_size)
 
@@ -64,19 +70,22 @@ def prepare_persons(hh, pp, tt, augment=5, max_hh_size=5, core_weekday=False, re
 
     return df
 
+
 def bins_to_labels(bins):
     """ Convert bins to labels """
-    res =  ["%.0f - %.0f" % (bins[i], bins[i + 1]) for i in range(len(bins) - 1)]
+    res = ["%.0f - %.0f" % (bins[i], bins[i + 1]) for i in range(len(bins) - 1)]
 
     if bins[-1] == np.inf:
         res[-1] = "%.0f+" % bins[-2]
 
     return res
 
+
 def cut(x, bins):
     """ Cut x into bind and return labels """
     return pd.cut(x, bins, labels=bins_to_labels(bins), right=False)
 
+
 def augment_persons(pp, factor=1, permute_age=0.5):
     """ Augment persons using p weight
 
@@ -101,6 +110,88 @@ def augment_persons(pp, factor=1, permute_age=0.5):
     return duplicated[check_age_employment(None, duplicated)]
 
 
+def income_to_economic_status(income, persons):
+    """ Convert income to economic status
+
+    The household gets an equivalence weight from its composition: 1 for the
+    first person, 0.3 per child below 14 and 0.5 for everybody else. The status
+    is looked up from the income bracket and this weight.
+
+    :param income: income in Euro, negative or missing values mean unknown
+    :param persons: persons table with the members of the household (needs ``age``)
+    :return: economic status, ``EconomicStatus.UNKNOWN`` without known income
+    """
+
+    # A NaN income fails every comparison below and would fall through to the highest bracket
+    if pd.isna(income) or income < 0:
+        return EconomicStatus.UNKNOWN
+
+    # Calculated according to Srv 2018
+    # https://tu-dresden.de/bu/verkehr/ivs/srv/ressourcen/dateien/SrV2018_Tabellenbericht_Oberzentren_500TEW-_flach.pdf?lang=de
+
+    children = (persons.age < 14).sum()
+    # The first person is counted with the constant weight 1 below, hence the -1
+    rest = len(persons) - children - 1
+
+    w = 0.3 * children + 1 + 0.5 * rest
+
+    if income < 1500:
+        if w < 1.3:
+            return EconomicStatus.LOW
+        return EconomicStatus.VERY_LOW
+
+    elif income < 2000:
+        if w < 1.3:
+            return EconomicStatus.MEDIUM
+        elif w < 1.6:
+            return EconomicStatus.LOW
+        return EconomicStatus.VERY_LOW
+
+    elif income < 2600:
+        if w < 1.6:
+            return EconomicStatus.MEDIUM
+        elif w < 2.3:
+            return EconomicStatus.LOW
+        return EconomicStatus.VERY_LOW
+
+    elif income < 3000:
+        if w < 1.3:
+            return EconomicStatus.HIGH
+        elif w < 2.3:
+            return EconomicStatus.MEDIUM
+        elif w < 3.0:
+            return EconomicStatus.LOW
+        return EconomicStatus.VERY_LOW
+
+    elif income < 3600:
+        if w < 1.6:
+            return EconomicStatus.HIGH
+        elif w < 2.3:
+            return EconomicStatus.MEDIUM
+        elif w < 3.5:
+            return EconomicStatus.LOW
+        return EconomicStatus.VERY_LOW
+
+    elif income < 4600:
+        if w < 2.1:
+            return EconomicStatus.HIGH
+        elif w < 3.0:
+            return EconomicStatus.MEDIUM
+        return EconomicStatus.LOW
+
+    elif income < 5600:
+        if w < 1.3:
+            return EconomicStatus.VERY_HIGH
+        elif w < 2.8:
+            return EconomicStatus.HIGH
+        return EconomicStatus.MEDIUM
+
+    else:
+        if w < 2.5:
+            return EconomicStatus.VERY_HIGH
+        return EconomicStatus.HIGH
+
+
 def prepare_trips(pp, trips, core_weekday=True):
     """ Create trip data frame """
 
@@ -286,7 +377,8 @@ def calc_commute(pp, tt):
         edu.groupby("p_id").agg(commute_dist=("gis_length", "mean"), weight=("t_weight", "max"))
 
 
-def calc_needed_short_distance_trips(ref_trips: pd.DataFrame, sim_trips: pd.DataFrame, max_dist=1000) -> Tuple[float, int]:
+def calc_needed_short_distance_trips(ref_trips: pd.DataFrame, sim_trips: pd.DataFrame, max_dist=1000) -> Tuple[
+    float, int]:
     """ Calculate number of short distance trips needed to add to match required share """
 
     target_share = float(ref_trips[ref_trips.gis_length < (max_dist / 1000)].t_weight.sum() / ref_trips.t_weight.sum())
@@ -296,4 +388,4 @@ def calc_needed_short_distance_trips(ref_trips: pd.DataFrame, sim_trips: pd.Data
     current_share = len(short_trips) / len(sim_trips)
     num_trips = (len(short_trips) - len(sim_trips) * target_share) / (target_share - 1)
 
-    return target_share, num_trips
\ No newline at end of file
+    return target_share, num_trips