Merge pull request #24 from opesci/compressionV2

Compression
devitocodes · Mar 13, 2019 · 86d884e · 86d884e
2 parents 77dad44 + 2a287df
commit 86d884e
Show file tree

Hide file tree

Showing 15 changed files with 720 additions and 105 deletions.
diff --git a/examples/use_modernised.py b/examples/use_modernised.py
@@ -61,25 +61,11 @@ def __init__(self, symbols):
             raise Exception("Symbols must be a Mapping, for example a \
                               dictionary.")
 
-    def save(self, ptr):
-        """Overwrite live-data in this Checkpoint object with data found at
-        the ptr location."""
-        i_ptr_lo = 0
-        i_ptr_hi = 0
-        for i in self.symbols:
-            i_ptr_hi = i_ptr_hi + self.symbols[i].size
-            ptr[i_ptr_lo:i_ptr_hi] = self.symbols[i].data[:]
-            i_ptr_lo = i_ptr_hi
-
-    def load(self, ptr):
-        """Copy live-data from this Checkpoint object into the memory given by
-        the ptr."""
-        i_ptr_lo = 0
-        i_ptr_hi = 0
-        for i in self.symbols:
-            i_ptr_hi = i_ptr_hi + self.symbols[i].size
-            self.symbols[i].data[:] = ptr[i_ptr_lo:i_ptr_hi]
-            i_ptr_lo = i_ptr_hi
+    def get_data_location(self, timestep):
+        """Return the ``data`` attribute of every symbol, in mapping order.
+
+        ``timestep`` is accepted but not used by this implementation.
+        """
+        return [symbol.data for symbol in self.symbols.values()]
+
+    def get_data(self, timestep):
+        """Return a list with the ``data`` attribute of each tracked symbol.
+
+        ``timestep`` is accepted but not used by this implementation.
+        """
+        return [entry.data for entry in self.symbols.values()]
 
     @property
     def size(self):

diff --git a/pyrevolve/compression.py b/pyrevolve/compression.py
@@ -0,0 +1,106 @@
+import blosc
+import pyzfp
+import numpy as np
+from contexttimer import Timer
+from functools import partial
+import pickle
+
+
+# Per-scheme default parameters. init_compression() copies every entry that
+# the caller's params dict does not already provide. 'chunk_size' is the
+# number of bytes blosc_compress() puts into each chunk; the 'zfp' entries are
+# forwarded as keyword arguments to pyzfp.
+DEFAULTS = {None: {}, 'blosc': {'chunk_size': 1000000},
+            'zfp': {'tolerance': 0.0000001, 'parallel': True}}
+
+
+def init_compression(params):
+    """Build the compressor/decompressor pair described by ``params``.
+
+    ``params`` is copied, so the caller's dictionary is left untouched. Its
+    optional ``'scheme'`` entry (default ``None``) picks the implementation:
+
+    * ``'custom'``: the callables are taken from the ``'compressor'`` and
+      ``'decompressor'`` entries.
+    * anything else: the callables are looked up in the module-level
+      ``compressors``/``decompressors`` tables (an unknown scheme raises
+      ``KeyError``) and entries missing from ``params`` are filled in from
+      ``DEFAULTS``.
+
+    Returns a ``(compress, decompress)`` tuple of ``functools.partial``
+    objects that already carry the remaining parameters as their first
+    argument, so each one only needs the data to process.
+    """
+    options = params.copy()
+    scheme = options.pop('scheme', None)
+    if scheme != 'custom':
+        compress = compressors[scheme]
+        decompress = decompressors[scheme]
+        # Explicitly supplied options win over the scheme defaults.
+        for key, value in DEFAULTS[scheme].items():
+            options.setdefault(key, value)
+    else:
+        compress = options.pop('compressor', None)
+        decompress = options.pop('decompressor', None)
+    return partial(compress, options), partial(decompress, options)
+
+
+def no_compression_in(params, indata):
+    """Wrap ``indata`` in a CompressedObject without compressing it.
+
+    The array's bytes are copied with ``tobytes()`` and exposed through a
+    ``memoryview``; shape and dtype are recorded alongside so the array can
+    be rebuilt later. ``params`` is ignored.
+    """
+    raw = memoryview(indata.tobytes())
+    return CompressedObject(raw, shape=indata.shape, dtype=indata.dtype)
+
+
+def no_compression_out(params, indata):
+    """Rebuild an array from an uncompressed CompressedObject-like value.
+
+    ``indata.data`` is reinterpreted as a flat buffer of ``indata.dtype``
+    and reshaped to ``indata.shape``. ``params`` is ignored.
+    """
+    flat = np.frombuffer(indata.data, dtype=indata.dtype)
+    return flat.reshape(indata.shape)
+
+
+def blosc_compress(params, indata):
+    """Compress ``indata`` with blosc, one fixed-size chunk at a time.
+
+    The array's bytes are split into pieces of ``params['chunk_size']`` bytes
+    and each piece is compressed independently. The compressed pieces are
+    stored back to back; their individual lengths are recorded under the
+    ``'chunks'`` metadata key (next to ``'shape'`` and ``'dtype'``) so the
+    stream can be split again when decompressing.
+
+    Returns a CompressedObject holding the compressed bytes and metadata.
+    """
+    # tobytes() replaces ndarray.tostring(), a deprecated alias that recent
+    # NumPy releases no longer provide.
+    raw = indata.tobytes()
+    chunk_size = params.get('chunk_size')
+    pieces = [blosc.compress(raw[start:start + chunk_size])
+              for start in range(0, len(raw), chunk_size)]
+    metadata = {'shape': indata.shape, 'dtype': indata.dtype,
+                'chunks': [len(piece) for piece in pieces]}
+    # Join once instead of growing a bytes object inside the loop, which
+    # re-copies everything accumulated so far on each iteration.
+    return CompressedObject(data=b''.join(pieces), metadata=metadata)
+
+
+def blosc_decompress(params, indata):
+    """Rebuild the array stored as back-to-back blosc-compressed chunks.
+
+    ``indata.metadata['chunks']`` lists the compressed length of every chunk
+    in ``indata.data``. Each chunk is decompressed on its own, the results
+    are concatenated and then interpreted as ``indata.dtype`` with shape
+    ``indata.shape``. ``params`` is ignored.
+    """
+    compressed = indata.data
+    pieces = []
+    offset = 0
+    for length in indata.metadata['chunks']:
+        pieces.append(blosc.decompress(compressed[offset:offset + length]))
+        offset += length
+    # Join once instead of growing a bytes object inside the loop, which
+    # re-copies everything accumulated so far on each iteration.
+    flat = np.frombuffer(b''.join(pieces), dtype=indata.dtype)
+    return flat.reshape(indata.shape)
+
+
+class CompressedObject(object):
+    """Compressed payload bundled with what is needed to rebuild the array.
+
+    Pass either ``shape`` and ``dtype`` or a ``metadata`` dict, not both.
+    A ``metadata`` dict must contain the ``'shape'`` and ``'dtype'`` keys
+    and may carry further scheme-specific entries. Both rules are enforced
+    with assertions.
+
+    Attributes set: ``data``, ``shape``, ``dtype``, ``metadata`` and
+    ``pickled_metadata`` (the metadata dict serialised with ``pickle``).
+    """
+
+    def __init__(self, data, shape=None, dtype=None, metadata=None):
+        assert(metadata is None or (shape is None and dtype is None))
+        if metadata is None:
+            # Only shape/dtype were given: synthesise the metadata dict.
+            metadata = {'shape': shape, 'dtype': dtype}
+        else:
+            assert('shape' in metadata and 'dtype' in metadata)
+            shape, dtype = metadata['shape'], metadata['dtype']
+        self.data = data
+        self.shape = shape
+        self.dtype = dtype
+        self.metadata = metadata
+        self.pickled_metadata = pickle.dumps(self.metadata)
+
+
+def zfp_compress(params, indata):
+    """Compress ``indata`` with pyzfp and wrap the result.
+
+    Every entry of ``params`` is forwarded to ``pyzfp.compress`` as a
+    keyword argument. The compressed buffer is exposed through a
+    ``memoryview`` and stored together with the array's shape and dtype.
+    """
+    payload = memoryview(pyzfp.compress(indata, **params))
+    return CompressedObject(payload, shape=indata.shape, dtype=indata.dtype)
+
+
+def zfp_decompress(params, indata):
+    """Decompress a CompressedObject with pyzfp.
+
+    ``indata`` must be a CompressedObject (checked with an assertion). Its
+    payload, shape and dtype are handed to ``pyzfp.decompress`` together
+    with every entry of ``params`` as keyword arguments.
+    """
+    assert(isinstance(indata, CompressedObject))
+    payload = indata.data
+    return pyzfp.decompress(payload, indata.shape, indata.dtype, **params)
+
+
+# Scheme name -> implementation tables consulted by init_compression(). The
+# None key selects the pass-through (no compression) pair; the 'custom'
+# scheme is not listed because its callables come from the caller's params.
+compressors = {None: no_compression_in, 'blosc': blosc_compress,
+               'zfp': zfp_compress}
+decompressors = {None: no_compression_out, 'blosc': blosc_decompress,
+                 'zfp': zfp_decompress}
+# Scheme names that have an entry in both tables above.
+allowed_names = [None, 'blosc', 'zfp']
diff --git a/pyrevolve/crevolve.pyx b/pyrevolve/crevolve.pyx
@@ -3,6 +3,9 @@ cimport revolve_c
 from enum import Enum
 import warnings
 
+from tools import OutputGrabber
+
+
 class RevolveError(Exception):
     pass
     # TODO: the hardcoded limits really should be removed in a future version. This should be as easy as replacing the arrays in the C++ code with an std::vector.
@@ -78,7 +81,8 @@ cdef class CRevolve(object):
 
     def revolve(self):
         cdef revolve_c.CACTION action
-        action = revolve_c.revolve(self.__r)
+        with OutputGrabber() as og:
+             action = revolve_c.revolve(self.__r)
         if(action == revolve_c.CACTION_ADVANCE):
             retAction = Action.advance
         elif(action == revolve_c.CACTION_TAKESHOT):

diff --git a/pyrevolve/custom_pickle.py b/pyrevolve/custom_pickle.py
@@ -0,0 +1,19 @@
+import pickle
+import numpy as np
+
+
+def dumps(data):
+    """Pickle ``data``, storing NumPy arrays as raw bytes plus metadata.
+
+    An ``np.ndarray`` is replaced by a dict holding its bytes, shape and
+    dtype, tagged with ``'creator': 'custom_pickle'``, before pickling.
+    Any other object is pickled unchanged.
+    """
+    if not isinstance(data, np.ndarray):
+        return pickle.dumps(data)
+    wrapped = {'data': data.tobytes(), 'shape': data.shape,
+               'dtype': data.dtype, 'creator': 'custom_pickle'}
+    return pickle.dumps(wrapped)
+
+
+def loads(data):
+    """Unpickle ``data``, rebuilding arrays stored in the wrapped form.
+
+    A payload counts as a wrapped array when it unpickles to a dict whose
+    ``'creator'`` entry equals ``'custom_pickle'``. Its ``'data'`` bytes are
+    then reinterpreted with the stored ``'dtype'`` and reshaped to the
+    stored ``'shape'``. Every other payload is returned as unpickled.
+
+    NOTE(review): ``pickle.loads`` can execute arbitrary code, so ``data``
+    must come from a trusted source.
+    """
+    outdata = pickle.loads(data)
+    if isinstance(outdata, dict) \
+       and outdata.get('creator') == 'custom_pickle':
+        # Keep the wrapper dict under its own name: 'shape' has to be read
+        # from the dict, not from the array that is built out of it.
+        wrapper = outdata
+        flat = np.frombuffer(wrapper['data'], dtype=wrapper['dtype'])
+        outdata = flat.reshape(wrapper['shape'])
+    return outdata
diff --git a/pyrevolve/logger.py b/pyrevolve/logger.py
@@ -0,0 +1,13 @@
+import logging
+
+# Package-wide "pyRevolve" logger. Importing this module configures it as a
+# side effect: level DEBUG plus one stream handler with a timestamped format.
+logger = logging.getLogger("pyRevolve")
+logger.setLevel(logging.DEBUG)
+
+# Handler with no explicit stream (logging's default stream is sys.stderr);
+# it lets every record from DEBUG upwards through.
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+# create formatter and add it to the handlers
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') # noqa
+
+ch.setFormatter(formatter)
+# add the handlers to the logger
+logger.addHandler(ch)
diff --git a/pyrevolve/profiling.py b/pyrevolve/profiling.py
@@ -0,0 +1,61 @@
+from timeit import default_timer
+
+
+class Timer(object):
+    """Context manager that reports the duration of its block to a profiler.
+
+    On exit the elapsed wall-clock time is stored on the instance as
+    ``elapsed_secs`` (seconds) and ``elapsed`` (milliseconds), and the
+    millisecond value is passed to ``profiler.increment(section, action,
+    elapsed)``. Exceptions raised inside the block are not suppressed.
+    """
+
+    def __init__(self, profiler, section, action):
+        self.profiler = profiler
+        self.section = section
+        self.action = action
+        self.timer = default_timer
+
+    def __enter__(self):
+        self.start = self.timer()
+        return self
+
+    def __exit__(self, *exc_info):
+        self.elapsed_secs = self.timer() - self.start
+        self.elapsed = self.elapsed_secs * 1000  # millisecs
+        self.profiler.increment(self.section, self.action, self.elapsed)
+
+
+class Profiler(object):
+    """Accumulate elapsed time and call counts per (section, action) pair.
+
+    ``timings`` and ``counts`` are two-level dicts keyed first by section
+    and then by action.
+    """
+
+    def __init__(self):
+        self.timings = {}
+        self.counts = {}
+
+    def get_timer(self, section, action):
+        """Return a Timer that reports to this profiler when it exits."""
+        return Timer(self, section, action)
+
+    def increment(self, section, action, elapsed):
+        """Add ``elapsed`` to the action's total and bump its call count."""
+        # Warning: Not thread safe
+        total = self.timings.get(section, {}).get(action, 0) + elapsed
+        self.timings.setdefault(section, {})[action] = total
+
+        calls = self.counts.get(section, {}).get(action, 0) + 1
+        self.counts.setdefault(section, {})[action] = calls
+
+    def summary(self):
+        """Return a multi-line report of total time and count per action."""
+        parts = ['****************']
+        for section, section_timings in self.timings.items():
+            parts.append('\nIn section %s:' % section)
+            for action, action_time in section_timings.items():
+                calls = self.counts[section][action]
+                parts.append('\n\tAction %s: %f (%d)'
+                             % (action, action_time, calls))
+        parts.append('\n****************')
+        return ''.join(parts)
+
+    def get_dict(self):
+        """Return the results flattened into a single-level dict.
+
+        Keys are ``<section>_<action>_timing`` and
+        ``<section>_<action>_counts``; all timing entries are inserted
+        before the count entries.
+        """
+        results = {}
+        tables = (('%s_%s_timing', self.timings),
+                  ('%s_%s_counts', self.counts))
+        for template, table in tables:
+            for section, per_action in table.items():
+                for action, value in per_action.items():
+                    results[template % (section, action)] = value
+        return results