Skip to content

Commit

Permalink
test for string timezone
Browse files Browse the repository at this point in the history
not sure it works as expected
  • Loading branch information
polinaeterna committed Jan 9, 2025
1 parent c68efb7 commit 351ef5c
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 20 deletions.
56 changes: 42 additions & 14 deletions services/worker/tests/fixtures/statistics_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1703,19 +1703,6 @@ def null_column(n_samples: int) -> list[None]:

datetime_dataset = Dataset.from_dict(
{
"datetime": [
datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
],
"datetime_string": [
"2024-01-01 00:00:00",
"2024-01-02 00:00:00",
Expand All @@ -1742,6 +1729,45 @@ def null_column(n_samples: int) -> list[None]:
"2024-01-10 00:00:00Z",
"2024-01-11 00:00:00Z",
],
"datetime_string_cet": [
"2024-01-01 00:00:00CET",
"2024-01-02 00:00:00CET",
"2024-01-03 00:00:00CET",
"2024-01-04 00:00:00CET",
"2024-01-05 00:00:00CET",
"2024-01-06 00:00:00CET",
"2024-01-07 00:00:00CET",
"2024-01-08 00:00:00CET",
"2024-01-09 00:00:00CET",
"2024-01-10 00:00:00CET",
"2024-01-11 00:00:00CET",
],
"datetime_string_tz": [
"2024-01-01 00:00:00+0200",
"2024-01-02 00:00:00+0200",
"2024-01-03 00:00:00+0200",
"2024-01-04 00:00:00+0200",
"2024-01-05 00:00:00+0200",
"2024-01-06 00:00:00+0200",
"2024-01-07 00:00:00+0200",
"2024-01-08 00:00:00+0200",
"2024-01-09 00:00:00+0200",
"2024-01-10 00:00:00+0200",
"2024-01-11 00:00:00+0200",
],
"datetime": [
datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
],
"datetime_tz": [
datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
Expand Down Expand Up @@ -1772,9 +1798,11 @@ def null_column(n_samples: int) -> list[None]:
},
features=Features(
{
"datetime": Value("timestamp[s]"),
"datetime_string": Value("string"),
"datetime_string_z": Value("string"),
"datetime_string_cet": Value("string"),
"datetime_string_tz": Value("string"),
"datetime": Value("timestamp[s]"),
"datetime_tz": Value("timestamp[s, tz=+02:00]"),
"datetime_null": Value("timestamp[s]"),
"datetime_all_null": Value("timestamp[s]"),
Expand Down
38 changes: 32 additions & 6 deletions services/worker/tests/test_statistics_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,17 +510,34 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
if column_name == "datetime_tz":
bin_edges = [f"{bin_edge}+0200" for bin_edge in bin_edges]
minv, maxv, mean, median = f"{minv}+0200", f"{maxv}+0200", f"{mean}+0200", f"{median}+0200"
elif column_name == "datetime_string_tz":
# "+0200" strings are expected in UTC, so every expected value is shifted two hours earlier
minv = "2023-12-31 22:00:00+0000"
maxv = "2024-01-10 22:00:00+0000"
mean = "2024-01-05 22:00:00+0000"
median = "2024-01-05 22:00:00+0000"
bin_edges = [
"2023-12-31 22:00:00+0000",
"2024-01-01 22:00:01+0000",
"2024-01-02 22:00:02+0000",
"2024-01-03 22:00:03+0000",
"2024-01-04 22:00:04+0000",
"2024-01-05 22:00:05+0000",
"2024-01-06 22:00:06+0000",
"2024-01-07 22:00:07+0000",
"2024-01-08 22:00:08+0000",
"2024-01-09 22:00:09+0000",
"2024-01-10 22:00:00+0000",
]

# compute std
seconds_in_day = 24 * 60 * 60
if column_name in ["datetime", "datetime_string", "datetime_string_z", "datetime_tz"]:
timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
elif column_name == "datetime_null":
if column_name == "datetime_null":
timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day)) # take every other day
hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1]
else:
raise ValueError("Incorrect column")
timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]

std = timedeltas.std()
std_str = str(datetime.timedelta(seconds=std))
Expand All @@ -542,7 +559,16 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name

@pytest.mark.parametrize(
"column_name",
["datetime", "datetime_string", "datetime_string_z", "datetime_tz", "datetime_null", "datetime_all_null"],
[
"datetime",
"datetime_string",
"datetime_string_z",
"datetime_string_cet",
"datetime_string_tz",
"datetime_tz",
"datetime_null",
"datetime_all_null",
],
)
def test_datetime_statistics(
column_name: str,
Expand Down

0 comments on commit 351ef5c

Please sign in to comment.