Skip to content

Commit

Permalink
Lazy date lut (#431)
Browse files Browse the repository at this point in the history
* Initial version with env variable & ranges

* Assuming we don't want to check ranges, this can be simplified to use the same LUT for both classes

* Assuming the additional 50 ms for cache warm-up is negligible, the LUT can always be lazy

* Tests & lints

* Add ability to control lazy lut & warm up with env variable
  • Loading branch information
DaniilAnichin authored May 9, 2024
1 parent c322fa6 commit e4d3fb5
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 15 deletions.
61 changes: 50 additions & 11 deletions clickhouse_driver/columns/datecolumn.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from os import getenv
from datetime import date, timedelta

from .base import FormatColumn
Expand All @@ -7,7 +8,53 @@
epoch_end = date(2149, 6, 6)

epoch_start_date32 = date(1900, 1, 1)
epoch_end_date32 = date(2283, 11, 11)
epoch_end_date32 = date(2299, 12, 31)


class LazyLUT(dict):
    """Dictionary that computes and caches values for missing keys.

    Behaves like a plain ``dict`` for keys that are already present. When
    ``lut[key]`` misses, the value is produced by calling the factory with
    that key, stored in the mapping and returned.
    """

    def __init__(self, *args, _factory, **kwargs):
        """Initialise the mapping and remember the value factory.

        :param args: positional arguments forwarded to ``dict``.
        :param _factory: keyword-only callable that receives a missing key
                         and returns the value to store for it.
        :param kwargs: keyword arguments forwarded to ``dict``.
        """
        super().__init__(*args, **kwargs)
        self._default_factory = _factory

    def __missing__(self, key):
        """Compute the value for ``key``, cache it and return it."""
        computed = self._default_factory(key)
        return self.setdefault(key, computed)


def make_date_lut_range(date_start, date_end):
    """Return the range of day offsets spanning two dates.

    Offsets are counted in days relative to ``epoch_start``. Both
    ``date_start`` and ``date_end`` are included in the resulting range.
    """
    first_day = (date_start - epoch_start).days
    last_day = (date_end - epoch_start).days
    return range(first_day, last_day + 1)


# Lazy LUT mode is controlled by an environment variable. The historical,
# misspelled name ("LASY") is still honoured so existing setups keep
# working; the correctly spelled name takes precedence when both are set.
# An unset or empty value disables lazy mode. Any non-empty value enables
# it; a value of the form "<start>:<end>" with ISO-format dates additionally
# warms the table up for that date range (both bounds included).
enable_lazy_date_lut = (
    getenv('CLICKHOUSE_DRIVER_LAZY_DATE_LUT')
    or getenv('CLICKHOUSE_DRIVER_LASY_DATE_LUT', False)
)
if enable_lazy_date_lut:
    try:
        start, end = enable_lazy_date_lut.split(':')
        start_date = date.fromisoformat(start)
        end_date = date.fromisoformat(end)

        date_range = make_date_lut_range(start_date, end_date)
    except ValueError:
        # The value is not a valid "<start>:<end>" pair:
        # stay lazy, but start with an empty (not warmed up) table.
        date_range = ()

    # Since we initialize the lazy lut with some initially warmed values,
    # we use an iterator and not a dict comprehension
    # for memory & time optimization
    _date_lut = LazyLUT(
        ((x, epoch_start + timedelta(days=x)) for x in date_range),
        _factory=lambda x: epoch_start + timedelta(days=x),
    )
    _date_lut_reverse = LazyLUT(
        ((value, key) for key, value in _date_lut.items()),
        _factory=lambda x: (x - epoch_start).days,
    )
else:
    # If the lazy lut is not enabled, we fall back to static dict
    # initialization. In both cases, we use the same lut for both data
    # types, since one encompasses the other and we can avoid duplicating
    # the overlap.
    date_range = make_date_lut_range(epoch_start_date32, epoch_end_date32)
    _date_lut = {x: epoch_start + timedelta(days=x) for x in date_range}
    _date_lut_reverse = {value: key for key, value in _date_lut.items()}


class DateColumn(FormatColumn):
Expand All @@ -18,9 +65,8 @@ class DateColumn(FormatColumn):
min_value = epoch_start
max_value = epoch_end

date_lut_days = (epoch_end - epoch_start).days + 1
date_lut = {x: epoch_start + timedelta(x) for x in range(date_lut_days)}
date_lut_reverse = {value: key for key, value in date_lut.items()}
date_lut = _date_lut
date_lut_reverse = _date_lut_reverse

def before_write_items(self, items, nulls_map=None):
null_value = self.null_value
Expand Down Expand Up @@ -60,10 +106,3 @@ class Date32Column(DateColumn):

min_value = epoch_start_date32
max_value = epoch_end_date32

date_lut_days = (epoch_end_date32 - epoch_start).days + 1
date_lut = {
x: epoch_start + timedelta(x)
for x in range((epoch_start_date32 - epoch_start).days, date_lut_days)
}
date_lut_reverse = {value: key for key, value in date_lut.items()}
12 changes: 8 additions & 4 deletions tests/columns/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,22 +84,26 @@ def test_wrong_date_insert(self):
data = [
(date(5555, 1, 1), ),
(date(1, 1, 1), ),
(date(2284, 1, 1), )
(date(2300, 1, 1), ),
(date(1899, 12, 31), )
]
self.client.execute('INSERT INTO test (a) VALUES', data)
query = 'SELECT * FROM test'
inserted = self.emit_cli(query)
self.assertEqual(inserted, '1970-01-01\n1970-01-01\n1970-01-01\n')
self.assertEqual(
inserted,
'1970-01-01\n1970-01-01\n1970-01-01\n1970-01-01\n',
)

@require_server_version(22, 8)
def test_boundaries_1900(self):
with self.create_table('a Date32'):
data = [(date(1900, 1, 1),)]
data = [(date(1900, 1, 1),), (date(2299, 12, 31), )]
self.client.execute('INSERT INTO test (a) VALUES', data)

query = 'SELECT * FROM test'
inserted = self.emit_cli(query)
self.assertEqual(inserted, '1900-01-01\n')
self.assertEqual(inserted, '1900-01-01\n2299-12-31\n')

inserted = self.client.execute(query)
self.assertEqual(inserted, data)
Expand Down

0 comments on commit e4d3fb5

Please sign in to comment.