From e4d3fb5c2109010bbe6b7e61b5f7222244744f83 Mon Sep 17 00:00:00 2001 From: Daniil Date: Thu, 9 May 2024 21:45:11 +0300 Subject: [PATCH] Lazy date lut (#431) * Initial version with env variable & ranges * Assuming we don't want to check ranges, can be simplified to use same lut for both classes * Assuming additional 50ms for cache warm-up are negligible, can always set lazy * Tests & lints * Add ability to control lazy lut & warm up with env variable --- clickhouse_driver/columns/datecolumn.py | 61 ++++++++++++++++++++----- tests/columns/test_date.py | 12 +++-- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/clickhouse_driver/columns/datecolumn.py b/clickhouse_driver/columns/datecolumn.py index d258a89d..7e84f76e 100644 --- a/clickhouse_driver/columns/datecolumn.py +++ b/clickhouse_driver/columns/datecolumn.py @@ -1,3 +1,4 @@ +from os import getenv from datetime import date, timedelta from .base import FormatColumn @@ -7,7 +8,53 @@ epoch_end = date(2149, 6, 6) epoch_start_date32 = date(1900, 1, 1) -epoch_end_date32 = date(2283, 11, 11) +epoch_end_date32 = date(2299, 12, 31) + + +class LazyLUT(dict): + def __init__(self, *args, _factory, **kwargs): + super().__init__(*args, **kwargs) + self._default_factory = _factory + + def __missing__(self, key): + return self.setdefault(key, self._default_factory(key)) + + +def make_date_lut_range(date_start, date_end): + return range( + (date_start - epoch_start).days, + (date_end - epoch_start).days + 1, + ) + + +enable_lazy_date_lut = getenv('CLICKHOUSE_DRIVER_LASY_DATE_LUT', False) +if enable_lazy_date_lut: + try: + start, end = enable_lazy_date_lut.split(':') + start_date = date.fromisoformat(start) + end_date = date.fromisoformat(end) + + date_range = make_date_lut_range(start_date, end_date) + except ValueError: + date_range = () + + # Since we initialize lazy lut with some initially warmed values, + # we use iterator and not dict comprehension for memory & time optimization + _date_lut = LazyLUT( + ((x, 
epoch_start + timedelta(days=x)) for x in date_range), + _factory=lambda x: epoch_start + timedelta(days=x), + ) + _date_lut_reverse = LazyLUT( + ((value, key) for key, value in _date_lut.items()), + _factory=lambda x: (x - epoch_start).days, + ) +else: + # If lazy lut is not enabled, we fallback to static dict initialization + # In both cases, we use same lut for both data types, + # since one encompasses the other and we can avoid duplicating overlap + date_range = make_date_lut_range(epoch_start_date32, epoch_end_date32) + _date_lut = {x: epoch_start + timedelta(days=x) for x in date_range} + _date_lut_reverse = {value: key for key, value in _date_lut.items()} class DateColumn(FormatColumn): @@ -18,9 +65,8 @@ class DateColumn(FormatColumn): min_value = epoch_start max_value = epoch_end - date_lut_days = (epoch_end - epoch_start).days + 1 - date_lut = {x: epoch_start + timedelta(x) for x in range(date_lut_days)} - date_lut_reverse = {value: key for key, value in date_lut.items()} + date_lut = _date_lut + date_lut_reverse = _date_lut_reverse def before_write_items(self, items, nulls_map=None): null_value = self.null_value @@ -60,10 +106,3 @@ class Date32Column(DateColumn): min_value = epoch_start_date32 max_value = epoch_end_date32 - - date_lut_days = (epoch_end_date32 - epoch_start).days + 1 - date_lut = { - x: epoch_start + timedelta(x) - for x in range((epoch_start_date32 - epoch_start).days, date_lut_days) - } - date_lut_reverse = {value: key for key, value in date_lut.items()} diff --git a/tests/columns/test_date.py b/tests/columns/test_date.py index 0b8c9c03..71a06f80 100644 --- a/tests/columns/test_date.py +++ b/tests/columns/test_date.py @@ -84,22 +84,26 @@ def test_wrong_date_insert(self): data = [ (date(5555, 1, 1), ), (date(1, 1, 1), ), - (date(2284, 1, 1), ) + (date(2300, 1, 1), ), + (date(1899, 12, 31), ) ] self.client.execute('INSERT INTO test (a) VALUES', data) query = 'SELECT * FROM test' inserted = self.emit_cli(query) - self.assertEqual(inserted, 
'1970-01-01\n1970-01-01\n1970-01-01\n') + self.assertEqual( + inserted, + '1970-01-01\n1970-01-01\n1970-01-01\n1970-01-01\n', + ) @require_server_version(22, 8) def test_boundaries_1900(self): with self.create_table('a Date32'): - data = [(date(1900, 1, 1),)] + data = [(date(1900, 1, 1),), (date(2299, 12, 31), )] self.client.execute('INSERT INTO test (a) VALUES', data) query = 'SELECT * FROM test' inserted = self.emit_cli(query) - self.assertEqual(inserted, '1900-01-01\n') + self.assertEqual(inserted, '1900-01-01\n2299-12-31\n') inserted = self.client.execute(query) self.assertEqual(inserted, data)