AI4S2S · semvijverberg · Dec 20, 2023 · Dec 20, 2023 · Jan 12, 2024 · Jan 16, 2024
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -3,23 +3,22 @@ name: Python package
 on:
   push:
     branches:
-    - main
+      - main
   pull_request:
     branches:
-    - main
+      - main
 
 jobs:
-
   build:
     name: Build for (${{ matrix.python-version }}, ${{ matrix.os }})
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
-        python-version: ['3.9', '3.10', '3.11']
+        os: ["ubuntu-latest", "macos-latest", "windows-latest"]
+        python-version: ["3.10", "3.11"]
     env:
-      MPLBACKEND: Agg  # https://github.com/orgs/community/discussions/26434
+      MPLBACKEND: Agg # https://github.com/orgs/community/discussions/26434
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}

diff --git a/.github/workflows/sonarcloud.yml b/.github/workflows/sonarcloud.yml
@@ -1,40 +1,41 @@
-name: sonarcloud
+# SonarCloud is disabled for now because it requires a token (possibly a paid plan).
+# name: sonarcloud
 
-on:
-  push:
-    branches:
-    - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-    branches:
-    - main
+# on:
+#   push:
+#     branches:
+#     - main
+#   pull_request:
+#     types: [opened, synchronize, reopened]
+#     branches:
+#     - main
 
-jobs:
+# jobs:
 
-  sonarcloud:
-    name: SonarCloud
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          fetch-depth: 0  # Shallow clones should be disabled for a better relevancy of analysis
-      - name: Set up Python
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-      - name: Python info
-        shell: bash -l {0}
-        run: |
-          which python3
-          python3 --version
-      - name: Install hatch
-        run: python3 -m pip install hatch
-      - name: Run unit tests with coverage
-        run: hatch run coverage
-      - name: Correct coverage paths
-        run: sed -i "s+$PWD/++g" coverage.xml
-      - name: SonarCloud Scan
-        uses: SonarSource/sonarcloud-github-action@master
-        env:
-          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN }}  # Needed to get PR information, if any
-          SONAR_TOKEN: ${{secrets.SONAR_TOKEN }}
+#   sonarcloud:
+#     name: SonarCloud
+#     runs-on: ubuntu-latest
+#     steps:
+#       - uses: actions/checkout@v3
+#         with:
+#           fetch-depth: 0  # Shallow clones should be disabled for a better relevancy of analysis
+#       - name: Set up Python
+#         uses: actions/setup-python@v3
+#         with:
+#           python-version: "3.10"
+#       - name: Python info
+#         shell: bash -l {0}
+#         run: |
+#           which python3
+#           python3 --version
+#       - name: Install hatch
+#         run: python3 -m pip install hatch
+#       - name: Run unit tests with coverage
+#         run: hatch run coverage
+#       - name: Correct coverage paths
+#         run: sed -i "s+$PWD/++g" coverage.xml
+#       - name: SonarCloud Scan
+#         uses: SonarSource/sonarcloud-github-action@master
+#         env:
+#           GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN }}  # Needed to get PR information, if any
+#           SONAR_TOKEN: ${{secrets.SONAR_TOKEN }}
diff --git a/lilio/utils.py b/lilio/utils.py
@@ -13,6 +13,7 @@
 
 
 MONTH_LENGTH = 30  # Month length for Timedelta checks.
+YEAR_LENGTH = 365.25  # Year length for Timedelta checks.
 
 
 def check_timeseries(
@@ -116,14 +117,24 @@ def infer_input_data_freq(
             data_freq = (data.time.values[1:] - data.time.values[:-1]).min()
 
     if isinstance(data_freq, str):
-        data_freq.replace("-", "")  # Get the absolute frequency
-
         if not re.match(r"\d+\D", data_freq):  # infer_freq can return "d" for "1d".
             data_freq = "1" + data_freq
 
+        # Annoying switch from the "2M" to the "2ME" format in pandas >= 2.2.
+        # We will need to adapt to this in the future.
+        if len(data_freq) in [3, 4] and data_freq[1:] in ["ME", "MS"]:
+            data_freq = data_freq.replace(data_freq[1:], "M")
+
         data_freq = (  # Deal with monthly timedelta case
             replace_month_length(data_freq) if data_freq[-1] == "M" else data_freq
         )
+
+        data_freq = (  # Deal with yearly timedelta case
+            replace_year_length(data_freq)
+            if "A" in data_freq or "Y" in data_freq
+            else data_freq
+        )
+
     return pd.Timedelta(data_freq)
 
 
@@ -133,12 +144,19 @@ def replace_month_length(length: str) -> str:
     return f"{ndays}d"
 
 
+def replace_year_length(length: str) -> str:
+    """Replace year lengths with an equivalent length in days.
+
+    Args:
+        length: Year-based length string, e.g. "1Y" or "2Y". A missing leading
+            number is treated as 1; a leading sign is ignored.
+
+    Returns:
+        The length as a day string (e.g. "730.5d" for "2Y"), based on YEAR_LENGTH.
+    """
+    # Honour the multiplier: "2Y" must become two years' worth of days, not one.
+    nyears = int(re.match(r"[+-]?(\d*)", length).group(1) or 1)
+    return f"{nyears * YEAR_LENGTH}d"
+
+
 def get_smallest_calendar_freq(calendar: "Calendar") -> pd.Timedelta:
     """Return the smallest length of the calendar's intervals as a Timedelta."""
     intervals = calendar.targets + calendar.precursors
     lengthstr = [iv.length for iv in intervals]
     lengthstr = [ln.replace("-", "") for ln in lengthstr]  # Account for neg. lengths
     lengthstr = [replace_month_length(ln) if ln[-1] == "M" else ln for ln in lengthstr]
+    # Year-based lengths are rewritten as day strings before building Timedeltas.
+    lengthstr = [replace_year_length(ln) if "Y" in ln else ln for ln in lengthstr]
     lengths = [pd.Timedelta(ln) for ln in lengthstr]
     return min(lengths)
 
@@ -155,6 +173,10 @@ def check_input_frequency(
     data_freq = infer_input_data_freq(data)
     calendar_freq = get_smallest_calendar_freq(calendar)
 
+    if data_freq == pd.Timedelta("365.25d") and calendar_freq == pd.Timedelta("1d"):
+        # Allow yearly (one-datapoint-per-year) data to be resampled to daily data.
+        return None
+
     if calendar_freq < data_freq:
         raise ValueError(
             "The data is of a lower time resolution than the calendar. "
@@ -345,6 +367,8 @@ def parse_freqstr_to_dateoffset(time_str):
         time_dict = {"months": int(time_str[:-1])}
     elif re.fullmatch(r"[+-]?\d*W", time_str):
         time_dict = {"weeks": int(time_str[:-1])}
+    elif re.fullmatch(r"[+-]?\d*Y", time_str):
+        time_dict = {"years": int(time_str[:-1])}
     else:
         raise ValueError("Please input a time string in the correct format.")
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -22,7 +22,7 @@ name = "lilio"
 description = "python package for generating calendars for machine learning timeseries analysis."
 readme = "README.md"
 license = "Apache-2.0"
-requires-python = ">=3.9, <3.12"
+requires-python = ">=3.10, <3.12"
 authors = [
   {email = "[email protected]"},
   {name = "Yang Liu, Bart Schilperoort, Peter Kalverla, Jannes van Ingen, Sem Vijverberg"}
@@ -49,14 +49,13 @@ classifiers = [
   "Operating System :: OS Independent",
   "Programming Language :: Python",
   "Programming Language :: Python :: 3 :: Only",
-  "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
 ]
 dependencies = [
   "netcdf4",
   "numpy",
-  "pandas",
+  "pandas < 2.2",
   "matplotlib",
   "xarray",
   "scikit-learn",

diff --git a/tests/test_calendar.py b/tests/test_calendar.py
@@ -43,6 +43,11 @@ def test_interval_weeks(self):
         assert target.length_dateoffset == DateOffset(weeks=3)
         assert target.gap_dateoffset == DateOffset(weeks=2)
 
+    def test_interval_years(self):
+        """A "1Y" length and a "1Y" gap both map onto a one-year DateOffset."""
+        one_year = DateOffset(years=1)
+        interval = Interval("target", "1Y", "1Y")
+        assert interval.length_dateoffset == one_year
+        assert interval.gap_dateoffset == one_year
+
     def test_target_interval_dict(self):
         a = {"months": 1, "weeks": 2, "days": 1}
         b = {"months": 2, "weeks": 1, "days": 5}

diff --git a/tests/test_resample.py b/tests/test_resample.py
@@ -72,6 +72,12 @@ def dummy_multidimensional(self):
             },
         )
 
+    @pytest.fixture
+    def dummy_calendar_with_year_freq(self):
+        """Calendar anchored at "Jan" with a target interval of length "1Y"."""
+        calendar = Calendar(anchor="Jan")
+        calendar.add_intervals("target", length="1Y")
+        return calendar
+
     # Tests start here:
     def test_non_mapped_calendar(self, dummy_calendar):
         with pytest.raises(ValueError):
@@ -253,6 +259,48 @@ def test_dataarray_attrs(self, dummy_calendar, dummy_dataarray):
         for att in expected_attrs:
             assert att in resampled.attrs.keys()
 
+    def test_resample_with_year_freq(
+        self,
+        dummy_calendar_with_year_freq,
+    ):
+        """Resample one-datapoint-per-year data with a calendar of "1Y" intervals."""
+        years = list(range(2019, 2022))
+        time_index = pd.to_datetime([f"{year}-02-01" for year in years])
+        test_data = np.random.random(len(time_index))
+        initseries = pd.Series(test_data, index=time_index, name="data1")
+        # The calendar will skip the last timestep because of how pd.intervals are
+        # defined (with left and right bounds), so a trailing NaN timestep is added.
+        # This is not a problem for resampling, but users should be aware of it.
+        # pd.concat is the public equivalent of the private Series._append.
+        padding = pd.Series([np.nan], index=[pd.to_datetime("2022-02-01")])
+        series = pd.concat([initseries, padding])
+        cal = dummy_calendar_with_year_freq
+        cal.map_to_data(series)
+        cal.get_intervals()
+        resampled = resample(cal, series)
+        assert all(np.equal(test_data, resampled.data.values)), "Data not equal."
+
+    def test_resample_with_one_datapoint_per_year(
+        self,
+    ):
+        """Resample one-datapoint-per-year data with a daily ("1d") target interval."""
+        years = list(range(2019, 2022))
+        time_index = pd.to_datetime([f"{year}-02-01" for year in years])
+        test_data = np.random.random(len(time_index))
+        initseries = pd.Series(test_data, index=time_index, name="data1")
+        # The calendar will skip the last timestep because of how pd.intervals are
+        # defined (with left and right bounds), so a trailing NaN timestep is added.
+        # This is not a problem for resampling, but users should be aware of it.
+        # pd.concat is the public equivalent of the private Series._append.
+        padding = pd.Series([np.nan], index=[pd.to_datetime("2022-02-01")])
+        series = pd.concat([initseries, padding])
+        cal = Calendar(anchor="02-01")
+        cal.add_intervals("target", length="1d")
+        cal.map_to_data(series)
+        cal.get_intervals()
+        resampled = resample(cal, series)
+        assert all(np.equal(test_data, resampled.data.values)), "Data not equal."
+
 
 TOO_LOW_FREQ_ERR = r".*lower time resolution than the calendar.*"
 TOO_LOW_FREQ_WARN = r".*input data frequency is very close to the Calendar's freq.*"

diff --git a/tests/test_traintest.py b/tests/test_traintest.py
@@ -1,4 +1,5 @@
 """Tests for Lilio's traintest module."""
+import re
 import numpy as np
 import pandas as pd
 import pytest
@@ -79,6 +80,17 @@ def test_kfold_xxy(dummy_data):
     xr.testing.assert_equal(y_test, y.sel(anchor_year=expected_test))
 
 
+def test_kfold_xxy_args(dummy_data):
+    """Star-unpacking the x inputs while also passing y= raises a TypeError."""
+    x1, x2, y = dummy_data
+    splitter = lilio.traintest.TrainTestSplit(KFold(n_splits=3))
+    expected_msg = re.escape(
+        "TrainTestSplit.split() got multiple values for argument 'y'"
+    )
+    x_args = [x1, x2]
+    with pytest.raises(TypeError, match=expected_msg):
+        next(splitter.split(*x_args, y=y))
+
+
 def test_kfold_xxy_tuple(dummy_data):
     """Correctly split x1, x2, and y."""
     x1, x2, y = dummy_data