Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix data freq y new #68

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
11 changes: 5 additions & 6 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,22 @@ name: Python package
on:
push:
branches:
- main
- main
pull_request:
branches:
- main
- main

jobs:

build:
name: Build for (${{ matrix.python-version }}, ${{ matrix.os }})
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
python-version: ['3.9', '3.10', '3.11']
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.10", "3.11"]
env:
MPLBACKEND: Agg # https://github.com/orgs/community/discussions/26434
MPLBACKEND: Agg # https://github.com/orgs/community/discussions/26434
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
Expand Down
75 changes: 38 additions & 37 deletions .github/workflows/sonarcloud.yml
Original file line number Diff line number Diff line change
@@ -1,40 +1,41 @@
name: sonarcloud
# disable sonarcloud for now as it requires a token (perhaps paid).
# name: sonarcloud

on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
branches:
- main
# on:
# push:
# branches:
# - main
# pull_request:
# types: [opened, synchronize, reopened]
# branches:
# - main

jobs:
# jobs:

sonarcloud:
name: SonarCloud
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Python info
shell: bash -l {0}
run: |
which python3
python3 --version
- name: Install hatch
run: python3 -m pip install hatch
- name: Run unit tests with coverage
run: hatch run coverage
- name: Correct coverage paths
run: sed -i "s+$PWD/++g" coverage.xml
- name: SonarCloud Scan
uses: SonarSource/sonarcloud-github-action@master
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN }} # Needed to get PR information, if any
SONAR_TOKEN: ${{secrets.SONAR_TOKEN }}
# sonarcloud:
# name: SonarCloud
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v3
# with:
# fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
# - name: Set up Python
# uses: actions/setup-python@v3
# with:
# python-version: "3.10"
# - name: Python info
# shell: bash -l {0}
# run: |
# which python3
# python3 --version
# - name: Install hatch
# run: python3 -m pip install hatch
# - name: Run unit tests with coverage
# run: hatch run coverage
# - name: Correct coverage paths
# run: sed -i "s+$PWD/++g" coverage.xml
# - name: SonarCloud Scan
# uses: SonarSource/sonarcloud-github-action@master
# env:
# GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN }} # Needed to get PR information, if any
# SONAR_TOKEN: ${{secrets.SONAR_TOKEN }}
28 changes: 26 additions & 2 deletions lilio/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@


MONTH_LENGTH = 30 # Month length for Timedelta checks.
YEAR_LENGTH = 365.25 # Year length for Timedelta checks.


def check_timeseries(
Expand Down Expand Up @@ -116,14 +117,24 @@ def infer_input_data_freq(
data_freq = (data.time.values[1:] - data.time.values[:-1]).min()

if isinstance(data_freq, str):
data_freq.replace("-", "") # Get the absolute frequency

if not re.match(r"\d+\D", data_freq): # infer_freq can return "d" for "1d".
data_freq = "1" + data_freq

# Annoying switch from the "2M" to the "2ME" format in pandas > 2.2.
# We will need to adapt to this in the future.
if len(data_freq) in [3, 4] and data_freq[1:] in ["ME", "MS"]:
data_freq = data_freq.replace(data_freq[1:], "M")

data_freq = ( # Deal with monthly timedelta case
replace_month_length(data_freq) if data_freq[-1] == "M" else data_freq
)

data_freq = ( # Deal with yearly timedelta case
replace_year_length(data_freq)
if "A" in data_freq or "Y" in data_freq
else data_freq
)

return pd.Timedelta(data_freq)


Expand All @@ -133,12 +144,19 @@ def replace_month_length(length: str) -> str:
return f"{ndays}d"


def replace_year_length(length: str) -> str:
    """Replace year lengths with an equivalent length in days.

    Args:
        length: Frequency string made of an optional leading integer count
            followed by a yearly unit, e.g. "1Y", "2Y" or a bare "Y".

    Returns:
        The count multiplied by YEAR_LENGTH, formatted as a "<ndays>d" string.
    """
    # Honour the leading multiplier so that e.g. "2Y" is not treated as a single
    # year. A string without a leading count is taken to mean one year.
    count = re.match(r"[+-]?(\d+)", length)
    nyears = int(count.group(1)) if count else 1
    ndays = nyears * YEAR_LENGTH
    return f"{ndays}d"


def get_smallest_calendar_freq(calendar: "Calendar") -> pd.Timedelta:
    """Find the shortest interval length of the calendar, as a Timedelta.

    Both the target and the precursor intervals are considered.
    """
    durations = []
    for interval in calendar.targets + calendar.precursors:
        # Only the magnitude matters here, so drop the sign of negative lengths.
        length = interval.length.replace("-", "")
        # Month and year lengths are first converted to a length in days.
        if length[-1] == "M":
            length = replace_month_length(length)
        if "Y" in length:
            length = replace_year_length(length)
        durations.append(pd.Timedelta(length))
    return min(durations)

Expand All @@ -155,6 +173,10 @@ def check_input_frequency(
data_freq = infer_input_data_freq(data)
calendar_freq = get_smallest_calendar_freq(calendar)

if data_freq == pd.Timedelta("365.25d") and calendar_freq == pd.Timedelta("1d"):
# Allow yearly (one-datapoint-per-year) data to be resampled to daily data.
return None

if calendar_freq < data_freq:
raise ValueError(
"The data is of a lower time resolution than the calendar. "
Expand Down Expand Up @@ -345,6 +367,8 @@ def parse_freqstr_to_dateoffset(time_str):
time_dict = {"months": int(time_str[:-1])}
elif re.fullmatch(r"[+-]?\d*W", time_str):
time_dict = {"weeks": int(time_str[:-1])}
elif re.fullmatch(r"[+-]?\d*Y", time_str):
time_dict = {"years": int(time_str[:-1])}
else:
raise ValueError("Please input a time string in the correct format.")

Expand Down
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ name = "lilio"
description = "python package for generating calendars for machine learning timeseries analysis."
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.9, <3.12"
requires-python = ">=3.10, <3.12"
authors = [
{email = "[email protected]"},
{name = "Yang Liu, Bart Schilperoort, Peter Kalverla, Jannes van Ingen, Sem Vijverberg"}
Expand All @@ -49,14 +49,13 @@ classifiers = [
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
]
dependencies = [
"netcdf4",
"numpy",
"pandas",
"pandas < 2.2",
"matplotlib",
"xarray",
"scikit-learn",
Expand Down
5 changes: 5 additions & 0 deletions tests/test_calendar.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ def test_interval_weeks(self):
assert target.length_dateoffset == DateOffset(weeks=3)
assert target.gap_dateoffset == DateOffset(weeks=2)

def test_interval_years(self):
    """A "1Y" length and a "1Y" gap both map to a one-year DateOffset."""
    one_year = DateOffset(years=1)
    interval = Interval("target", "1Y", "1Y")
    assert interval.length_dateoffset == one_year
    assert interval.gap_dateoffset == one_year

def test_target_interval_dict(self):
a = {"months": 1, "weeks": 2, "days": 1}
b = {"months": 2, "weeks": 1, "days": 5}
Expand Down
48 changes: 48 additions & 0 deletions tests/test_resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ def dummy_multidimensional(self):
},
)

@pytest.fixture
def dummy_calendar_with_year_freq(self):
    """Calendar anchored on "Jan" with a single target interval of length "1Y"."""
    yearly_calendar = Calendar(anchor="Jan")
    yearly_calendar.add_intervals("target", length="1Y")
    return yearly_calendar

# Tests start here:
def test_non_mapped_calendar(self, dummy_calendar):
with pytest.raises(ValueError):
Expand Down Expand Up @@ -253,6 +259,48 @@ def test_dataarray_attrs(self, dummy_calendar, dummy_dataarray):
for att in expected_attrs:
assert att in resampled.attrs.keys()

def test_resample_with_year_freq(
    self,
    dummy_calendar_with_year_freq,
):
    """Testing resampling yearly data with the year-frequency calendar fixture."""
    years = list(range(2019, 2022))
    time_index = pd.to_datetime([f"{year}-02-01" for year in years])
    test_data = np.random.random(len(time_index))
    initseries = pd.Series(test_data, index=time_index, name="data1")
    # The calendar will skip the last timestep because of how pd.intervals are
    # defined (with left and right bounds). This is not a problem for resampling,
    # but it is something for the user to be aware of.
    # The padding timestep is added with the public pd.concat instead of the
    # private Series._append, which is not part of the supported pandas API.
    padding = pd.Series([np.nan], index=[pd.to_datetime("2022-02-01")])
    series = pd.concat([initseries, padding])
    cal = dummy_calendar_with_year_freq
    cal.map_to_data(series)
    cal.get_intervals()
    resampled = resample(cal, series)
    assert all(np.equal(test_data, resampled.data.values)), "Data not equal."

def test_resample_with_one_datapoint_per_year(
    self,
):
    """Testing resampling when you have only 1 datapoint per year.

    The calendar is anchored on "02-01" and has a single "1d" target interval.
    """
    years = list(range(2019, 2022))
    time_index = pd.to_datetime([f"{year}-02-01" for year in years])
    test_data = np.random.random(len(time_index))
    initseries = pd.Series(test_data, index=time_index, name="data1")
    # The calendar will skip the last timestep because of how pd.intervals are
    # defined (with left and right bounds). This is not a problem for resampling,
    # but it is something for the user to be aware of.
    # The padding timestep is added with the public pd.concat instead of the
    # private Series._append, which is not part of the supported pandas API.
    padding = pd.Series([np.nan], index=[pd.to_datetime("2022-02-01")])
    series = pd.concat([initseries, padding])
    cal = Calendar(anchor="02-01")
    cal.add_intervals("target", length="1d")
    cal.map_to_data(series)
    cal.get_intervals()
    resampled = resample(cal, series)
    assert all(np.equal(test_data, resampled.data.values)), "Data not equal."


TOO_LOW_FREQ_ERR = r".*lower time resolution than the calendar.*"
TOO_LOW_FREQ_WARN = r".*input data frequency is very close to the Calendar's freq.*"
Expand Down
12 changes: 12 additions & 0 deletions tests/test_traintest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Tests for Lilio's traintest module."""
import re
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -79,6 +80,17 @@ def test_kfold_xxy(dummy_data):
xr.testing.assert_equal(y_test, y.sel(anchor_year=expected_test))


def test_kfold_xxy_args(dummy_data):
    """Splitting with x1, x2 as positional args plus keyword y raises TypeError."""
    x1, x2, y = dummy_data
    splitter = lilio.traintest.TrainTestSplit(KFold(n_splits=3))
    expected_msg = re.escape(
        "TrainTestSplit.split() got multiple values for argument 'y'"
    )
    with pytest.raises(TypeError, match=expected_msg):
        next(splitter.split(x1, x2, y=y))


def test_kfold_xxy_tuple(dummy_data):
"""Correctly split x1, x2, and y."""
x1, x2, y = dummy_data
Expand Down
Loading