Skip to content

Commit

Permalink
Global code cleanup. Also minor refactoring, improved some tests and …
Browse files Browse the repository at this point in the history
…minor fixes.
  • Loading branch information
sarusso committed Nov 15, 2024
1 parent dad9d15 commit 97446e4
Show file tree
Hide file tree
Showing 24 changed files with 522 additions and 775 deletions.
2 changes: 1 addition & 1 deletion jupyter.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
set -e

# This script will build the Timeseria container and start it with Jupyter.
# This script will build the Timeseria container and start it with Jupyter

# Build
if [[ "x$BUILD" != "xFalse" ]]; then
Expand Down
23 changes: 0 additions & 23 deletions pypi.sh

This file was deleted.

2 changes: 1 addition & 1 deletion requirements_pinned.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ fitter==1.7.0
# Optional
#tensorflow==2.7.0 # TensorFlow (also -gpu, -macos or -aarch64 variants)
#prophet==1.1.5 # Facebook's Prophet
#pmdarima==2.0.4 # AARIMA
#pmdarima==2.0.4 # AARIMA
#statsmodels==0.14.1 # ARIMA, SARIMAX
2 changes: 1 addition & 1 deletion test.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
set -e

# This script will build the Timeseria container and then run the tests into it.
# This script will build the Timeseria container and run the tests into it

# Build
if [[ "x$BUILD" != "xFalse" ]]; then
Expand Down
92 changes: 31 additions & 61 deletions timeseria/datastructures.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
"""Base data structures as Points, Slots and Series."""
"""Base data structures as Points, Slots, and Series."""

import json
from copy import deepcopy
Expand Down Expand Up @@ -95,7 +95,7 @@ class TimePoint(Point):

def __init__(self, *args, **kwargs):

# Handle timezone if any (removing it from kwargs)
# Handle time zone if any (removing it from kwargs)
tz = kwargs.pop('tz', None)
if tz:
self._tz = timezonize(tz)
Expand All @@ -112,21 +112,17 @@ def __init__(self, *args, **kwargs):
# Ok, will convert the datetime to epoch and then create the point in the standard way
t = s_from_dt(kwargs['dt'])

# If we do not have a timezone, can we use the one from the dt used to initialize this TimePoint?
# If we do not have a time zone, can we use the one from the dt used to initialize this TimePoint?
try:
self._tz
except AttributeError:
if kwargs['dt'].tzinfo:

#Do not set it if it is UTC, it is the default
# Do not set it if it is UTC, it is the default
if kwargs['dt'].tzinfo == UTC:
pass
else:
self._tz = kwargs['dt'].tzinfo
#raise NotImplementedError('Not yet tz from dt ("{}")'.format(kwargs['dt']))

#else:
# raise Exception('Don\'t know how to handle all kwargs (got "{}")'.format(kwargs))

# Cast or create in the standard way
elif args:
Expand Down Expand Up @@ -155,7 +151,7 @@ def __gt__(self, other):

@property
def tz(self):
"""The timezone."""
"""The time zone."""
try:
return self._tz
except AttributeError:
Expand Down Expand Up @@ -202,9 +198,6 @@ def __init__(self, *args, **kwargs):
raise ValueError('Got type "{}" for data_indexes, was expecting a dict'.format(data_indexes.__class__.__name__))
except KeyError:
data_indexes = {}
#else:
# if None in data_indexes.values():
# raise ValueError('Cannot have an index set to None: do not set it at all ({})'.format(data_indexes))

# Special data loss index
try:
Expand Down Expand Up @@ -237,9 +230,9 @@ def __eq__(self, other):
@property
def data(self):
"""The data."""
# Data is set like this as it cannot be set if not in the init (read: changed after created)
# to prevent this to happend when the point is in a series where they are all supposed
# to carry the same data type and with the same number of elements. TODO: check!
# Data is implemented using a property to enforce that it cannot be changed after being set
# via the init, in particular with respect to the series, where data points are checked, upon
# insertion, to carry the same data type and with the same number of elements.
return self._data

@property
Expand All @@ -257,7 +250,6 @@ def data_loss(self):
try:
return self.data_indexes['data_loss']
except KeyError:
#raise AttributeError('No data loss index set for this point')
return None

def data_labels(self):
Expand Down Expand Up @@ -297,7 +289,6 @@ def __repr__(self):
return 'Time point @ {} ({}) with data "{}"'.format(self.t, self.dt, self.data)



#======================
# Slots
#======================
Expand Down Expand Up @@ -420,15 +411,11 @@ def __init__(self, start=None, end=None, unit=None, **kwargs):
# Extra time zone checks
if start and end:
if start.tz != end.tz:
raise ValueError('{} start and end must have the same timezone (got start.tz="{}", end.tz="{}")'.format(self.__class__.__name__, start.tz, end.tz))
raise ValueError('{} start and end must have the same time zone (got start.tz="{}", end.tz="{}")'.format(self.__class__.__name__, start.tz, end.tz))

# Call parent init
super(TimeSlot, self).__init__(start=start, end=end, unit=unit)

# If we did not have the end, set its timezone now:
#if end is None:
# self.end.change_tz(self.start.tz)

# Store time zone
self.tz = start.tz

Expand Down Expand Up @@ -504,9 +491,6 @@ def __init__(self, *args, **kwargs):
raise ValueError('Got type "{}" for data_indexes, was expecting a dict'.format(data_indexes.__class__.__name__))
except KeyError:
data_indexes = {}
#else:
# if None in data_indexes.values():
# raise ValueError('Cannot have an index set to None: do not set it at all ({})'.format(data_indexes))

# Special data loss index
try:
Expand Down Expand Up @@ -536,9 +520,9 @@ def __eq__(self, other):
@property
def data(self):
"""The data."""
# Data is set like this as it cannot be set if not in the init (read: changed after created)
# to prevent this to happened when the point is in a series where they are all supposed
# to carry the same data type and with the same number of elements. TODO: check me!
# Data is implemented using a property to enforce that it cannot be changed after being set
# via the init, in particular with respect to the series, where data slots are checked, upon
# insertion, to carry the same data type and with the same number of elements.
return self._data

@property
Expand All @@ -556,7 +540,6 @@ def data_loss(self):
try:
return self.data_indexes['data_loss']
except KeyError:
#raise AttributeError('No data loss index set for this point')
return None

def data_labels(self):
Expand Down Expand Up @@ -643,8 +626,6 @@ def append(self, item):
"""Append an item to the series. Accepts only items of the same
type as the items already present in the series (unless empty)"""

# TODO: move to use the insert?

# Check type
if self.item_type:
if not isinstance(item, self.item_type):
Expand Down Expand Up @@ -714,7 +695,6 @@ def _all_data_indexes(self):
"""Return all the data_indexes of the series, to be intended as custom
defined indicators (i.e. data_loss, anomaly_index, etc.)."""

# TODO: move this to the Data*Series...?
data_index_names = []
for item in self:
for index_name in item.data_indexes:
Expand Down Expand Up @@ -874,7 +854,6 @@ def __getitem__(self, key):
# Try filtering on this data label only
return self.filter(key)
else:
# TOOD: this will not work for SeriesView if ever implemented
return super(Series, self).__getitem__(key)


Expand All @@ -898,15 +877,13 @@ def rename_data_label(self, old_data_label, new_data_label):
if len(self) > 0 and not self._item_data_reference:
raise TypeError('Series items have no data, cannot rename a label')
for item in self:
# TODO: move to the DataPoint/DataSlot?
item.data[new_data_label] = item.data.pop(old_data_label)

def remove_data_label(self, data_label):
"""Remove a data label, in-place."""
if len(self) > 0 and not self._item_data_reference:
raise TypeError('Series items have no data, cannot rename a label')
for item in self:
# TODO: move to the DataPoint/DataSlot?
item.data.pop(data_label, None)

def remove_data_index(self, data_index):
Expand All @@ -923,9 +900,6 @@ def remove_data_loss(self):
for item in self:
item.data_indexes.pop('data_loss', None)




#=========================
# Operations
#=========================
Expand Down Expand Up @@ -1206,7 +1180,7 @@ def __repr__(self):
def __init__(self, *args, **kwargs):


# Handle timezone
# Handle time zone
tz = kwargs.pop('tz', None)
if tz:
self._tz = timezonize(tz)
Expand All @@ -1231,13 +1205,10 @@ def append(self, item):
except AttributeError:
pass
else:
# Check time ordering and handle the resolution.

# The following if is to support the deepcopy, otherwise the original prev_t will be used
# TODO: maybe move the above to a "hasattr" plus an "and" instead of this logic?
# Check time ordering and handle the resolution. It is done in this way to support
# the deepcopy, otherwise the original prev_t will be used.
if len(self)>0:

# logger.debug('Checking time ordering for t="%s" (prev_t="%s")', item.t, self.prev_t)
if item.t < self.prev_t:
raise ValueError('Time t="{}" is out of order (prev t="{}")'.format(item.t, self.prev_t))

Expand Down Expand Up @@ -1265,7 +1236,7 @@ def append(self, item):
del self._resolution_as_seconds
self._resolution = 'variable'
finally:
# Delete the autodetected sampling interval cache if present
# Delete the auto-detected sampling interval cache if present
try:
del self._autodetected_sampling_interval
del self._autodetected_sampling_interval_confidence
Expand All @@ -1276,18 +1247,18 @@ def append(self, item):

elif isinstance(item, TimeSlot):

# Slots can belong to the same series if they are in succession (tested with the __succedes__ method)
# Slots can belong to the same series if they are in succession (checked with the __succedes__ method)
# and if they have the same unit, which we test here instead as the __succedes__ is more general.

# Check the timezone (only for slots, points are not affected by timezones)
# Check the time zone (only for slots, points are not affected by time zones)
if not self.tz:
# If no timezone set, use the item one's
# If no time zone set, use the item's one
self._tz = item.tz

else:
# Else, check for the same timezone
# Else, check for the same time zone
if self._tz != item.tz:
raise ValueError('Cannot append slots on different timezones (I have "{}" and you tried to add "{}")'.format(self.tz, item.start.tz))
raise ValueError('Cannot append slots on different time zones (I have "{}" and you tried to add "{}")'.format(self.tz, item.start.tz))

try:
if self._resolution != item.unit:
Expand All @@ -1310,7 +1281,7 @@ def append(self, item):
super(TimeSeries, self).append(item)

def _item_by_t(self, t):
# TODO: improve performance, bisection first, then use an index?
# TODO: improve performance here. Bisection first, then maybe use an index-based mapping?
for item in self:
if item.t == t:
return item
Expand Down Expand Up @@ -1351,25 +1322,24 @@ def __getitem__(self, arg):
return self.get(arg)

#=========================
# Timezone-related
# Time zone-related
#=========================

@property
def tz(self):
"""The timezone of the time series."""
# Note: we compute the tz on the fly because for point time series we assume to use the tz
# attribute way lass than the slot time series, where the tz is instead computed at append-time.
"""The time zone of the time series."""
try:
return self._tz
except AttributeError:
# Detect timezone on the fly
# TODO: this ensures that each point is on the same timezone. Do we want this?
# Detect time zone on the fly. Only applies for point time series.
# If different time zones are mixed, then fall back on UTC.
# TODO: set the tz at append-time for point time series as well?
detected_tz = None
for item in self:
if not detected_tz:
detected_tz = item.tz
else:
# Terrible but seems like no other way to compare pytz.tzfile.* classes
# Terrible, but there seems to be no other way to compare pytz.tzfile.* classes
if str(item.tz) != str(detected_tz):
return UTC
return detected_tz
Expand Down Expand Up @@ -1505,7 +1475,7 @@ def load(cls, file_name):
if loaded_series.__class__ == cls:
return loaded_series
else:
# TODO: the following is a huge performance hit...
# TODO: improve performance here, the following is highly inefficient.
series_items = loaded_series.contents()
cls(*series_items)

Expand Down Expand Up @@ -1597,7 +1567,7 @@ def from_csv(cls, file_name, *args, **kwargs):
if loaded_series.__class__ == cls:
return loaded_series
else:
# TODO: the following is a huge performance hit...
# TODO: improve performance here, the following is highly inefficient.
series_items = loaded_series.contents()
cls(*series_items)

Expand All @@ -1620,7 +1590,7 @@ def from_df(cls, df, item_type='auto'):

if not unit_str_pd:
if not item_type:
logger.info('Cannot infer the freqency of the dataframe, will just create points')
logger.info('Cannot infer the frequency of the dataframe, will just create points')
item_type = DataTimePoint

else:
Expand Down
21 changes: 7 additions & 14 deletions timeseria/interpolators.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,13 @@ def evaluate(self, at, prev_i=None, next_i=None):

for label in self.series.data_labels():

if False:
# TODO: check the math here, this should be a better approach but tests fail...
coordinate_increment = at - prev_point.t
value_diff = next_point.data[label] - prev_point.data[label]
interpolated_data[label] = prev_point.data[label] + (value_diff * coordinate_increment)

else:
# Compute the "growth" ratio
diff = next_point.data[label] - prev_point.data[label]
delta_t = next_point.t - prev_point.t
ratio = diff / delta_t

# Compute the value of the data for the new point
interpolated_data[label] = prev_point.data[label] + ((at-prev_point.t)*ratio)
# Compute the "growth" ratio
diff = next_point.data[label] - prev_point.data[label]
delta_t = next_point.t - prev_point.t
ratio = diff / delta_t

# Compute the value of the data for the new point
interpolated_data[label] = prev_point.data[label] + ((at-prev_point.t)*ratio)


return interpolated_data
Expand Down
1 change: 0 additions & 1 deletion timeseria/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
def setup(level=LOGLEVEL, force=False):
timeseria_logger = logging.getLogger('timeseria')
timeseria_logger.propagate = False
#print('Setting log level to "{}"'.format(level))
try:
configured = False
for handler in timeseria_logger.handlers:
Expand Down
Loading

0 comments on commit 97446e4

Please sign in to comment.