From 263d4bdc38d51fc3e75fd8710353ed2d57e405a1 Mon Sep 17 00:00:00 2001
From: Hristo Georgiev <hristo.a.georgiev@gmail.com>
Date: Wed, 7 Apr 2021 16:29:57 +0000
Subject: [PATCH 1/2] Introduce `safer_eval`

---
 partd/numpy.py             |  9 ++++---
 partd/tests/test_numpy.py  | 53 +++++++++++++++++++++++++++++++++++---
 partd/tests/test_pandas.py | 11 +++++---
 partd/tests/test_utils.py  | 20 +++++++++++++-
 partd/utils.py             | 44 ++++++++++++++++++++++++++++---
 5 files changed, 123 insertions(+), 14 deletions(-)

diff --git a/partd/numpy.py b/partd/numpy.py
index ee28052..31e5c1c 100644
--- a/partd/numpy.py
+++ b/partd/numpy.py
@@ -5,12 +5,15 @@
 description of the array's dtype.
 """
 from __future__ import absolute_import
+
+from toolz import identity, partial, valmap
+
 import numpy as np
-from toolz import valmap, identity, partial
+
 from .compatibility import pickle
 from .core import Interface
 from .file import File
-from .utils import frame, framesplit, suffix, ignoring
+from .utils import frame, framesplit, ignoring, safer_eval, suffix
 
 
 def serialize_dtype(dt):
@@ -34,7 +37,7 @@ def parse_dtype(s):
     dtype([('a', '<i4')])
     """
     if s.startswith(b'['):
-        return np.dtype(eval(s))  # Dangerous!
+        return np.dtype(safer_eval(s))
     else:
         return np.dtype(s)
 
diff --git a/partd/tests/test_numpy.py b/partd/tests/test_numpy.py
index fa8eec2..1c2ff34 100644
--- a/partd/tests/test_numpy.py
+++ b/partd/tests/test_numpy.py
@@ -1,12 +1,16 @@
 from __future__ import absolute_import
 
+import pickle
+
 import pytest
+
+import partd
+from partd.numpy import Numpy, parse_dtype
+from partd.utils import safer_eval
+
 np = pytest.importorskip('numpy')  # noqa
 
-import pickle
 
-import partd
-from partd.numpy import Numpy
 
 
 def test_numpy():
@@ -70,3 +74,46 @@ def test_non_utf8_bytes():
                   b'\xf0\x28\x8c\xbc'], dtype='O')
     s = partd.numpy.serialize(a)
     assert (partd.numpy.deserialize(s, 'O') == a).all()
+
+
+def test_safer_eval_tuples():
+    # Test different quotation mark types.
+    assert np.dtype(safer_eval(b'[("a", "i4")]')) == np.dtype([('a', '<i4')])
+
+    assert np.dtype(safer_eval(b"[('a', 'i4')]")) == np.dtype([('a', '<i4')])
+    assert np.dtype(safer_eval(b"[('b', 'i2')]")) == np.dtype([('b', '<i2')])
+    assert np.dtype(safer_eval(b"[('c', 'f8')]")) == np.dtype([('c', '<f8')])
+
+    assert (
+        np.dtype(safer_eval(b"[('x', 'i4'), ('y', 'i4')]")) ==
+        np.dtype([('x', '<i4'), ('y', '<i4')])
+    )
+
+    assert (
+        np.dtype(safer_eval(b"[('a', 'i4'), ('b', 'i2'), ('c', 'f8')]")) ==
+        np.dtype([('a', '<i4'), ('b', '<i2'), ('c', '<f8')])
+    )
+
+
+@pytest.mark.parametrize('text,parsed', [
+    (b'a', 'S'),
+    (b'b', 'int8'),
+    (b'c', 'S1'),
+    (b'i2', 'int16'),
+    (b'i4', 'int32'),
+    (b'f8', 'float64'),
+    (b'M8[us]', '<M8[us]'),
+    (b'M8[s]', '<M8[s]'),
+    (b'datetime64[D]', '<M8[D]'),
+    (b'timedelta64[25s]', '<m8[25s]'),
+    (
+        b"i4, (2,3)f8",
+        [('f0', '<i4'), ('f1', '<f8', (2, 3))],
+    ),
+    (
+        b"[('a', 'i4'), ('b', 'i2'), ('c', 'f8')]",
+        [('a', '<i4'), ('b', '<i2'), ('c', '<f8')],
+    ),
+])
+def test_parse_dtype(text, parsed):
+    assert parse_dtype(text) == np.dtype(parsed)
diff --git a/partd/tests/test_pandas.py b/partd/tests/test_pandas.py
index 64f67c9..62fa2d4 100644
--- a/partd/tests/test_pandas.py
+++ b/partd/tests/test_pandas.py
@@ -1,14 +1,17 @@
 from __future__ import absolute_import
 
+import os
+
 import pytest
-pytest.importorskip('pandas')  # noqa
 
 import numpy as np
 import pandas as pd
-import pandas.util.testing as tm
-import os
+import pandas.testing as tm
+from partd.pandas import PandasBlocks, PandasColumns, deserialize, serialize
+
+pytest.importorskip('pandas')  # noqa
+
 
-from partd.pandas import PandasColumns, PandasBlocks, serialize, deserialize
 
 
 df1 = pd.DataFrame({'a': [1, 2, 3],
diff --git a/partd/tests/test_utils.py b/partd/tests/test_utils.py
index 7bf0413..56c9a18 100644
--- a/partd/tests/test_utils.py
+++ b/partd/tests/test_utils.py
@@ -1,6 +1,9 @@
-from partd.utils import frame, framesplit
 import struct
 
+import pytest
+
+from partd.utils import frame, framesplit, safer_eval
+
 
 def test_frame():
     assert frame(b'Hello') == struct.pack('Q', 5) + b'Hello'
@@ -9,3 +12,18 @@ def test_frame():
 def test_framesplit():
     L = [b'Hello', b'World!', b'123']
     assert list(framesplit(b''.join(map(frame, L)))) == L
+
+
+def test_safer_eval_safe():
+    assert safer_eval("[1, 2, 3]") == [1, 2, 3]
+    assert safer_eval("['a', 'b', 'c']") == ['a', 'b', 'c']
+
+
+def test_safer_eval_unsafe():
+    with pytest.raises(ValueError) as excinfo:
+        safer_eval("\xe1")
+    assert "non-printable" in str(excinfo.value)
+
+    with pytest.raises(ValueError) as excinfo:
+        safer_eval("__import__('os').system('ls')")
+    assert "__" in str(excinfo.value)
diff --git a/partd/utils.py b/partd/utils.py
index 004f176..71d36f8 100644
--- a/partd/utils.py
+++ b/partd/utils.py
@@ -1,8 +1,12 @@
-from contextlib import contextmanager
 import os
 import shutil
-import tempfile
 import struct
+import tempfile
+from contextlib import contextmanager
+from string import printable as _printable
+
+# Drop the five trailing whitespace control characters (\t\n\r\x0b\x0c).
+printable = _printable[:-5]
 
 
 def raises(exc, lamda):
@@ -48,7 +52,6 @@ def framesplit(bytes):
     [b'Hello', b'World']
     """
     i = 0; n = len(bytes)
-    chunks = list()
     while i < n:
         nbytes = struct.unpack('Q', bytes[i:i+8])[0]
         i += 8
@@ -174,3 +177,38 @@ def extend(key, term):
         key = (key,)
 
     return key + term
+
+
+def safer_eval(source):
+    """ A safer alternative to the built-in ``eval``
+
+    Extra safety comes from validating the input before it is evaluated.
+
+    Note that this is not 100% bullet-proof, as it still relies on ``eval``
+    internally.
+
+    Examples
+    --------
+
+    >>> safer_eval("1")
+    1
+    >>> safer_eval("[1, 2, 3]")
+    [1, 2, 3]
+    >>> safer_eval("['a', 'b', 'c']")
+    ['a', 'b', 'c']
+    """
+    # The original object is what gets evaluated; the sanity checks below run
+    # on ``str(source)`` whenever ``source`` is not already a ``str``.
+    string = source if type(source) is str else str(source)
+
+    # Disallow evaluation of non-printable characters.
+    if any(map(lambda c: c not in printable, string)):
+        raise ValueError("Cannot evaluate strings containing non-printable characters")
+
+    # Disallow evaluation of dunder/magic Python methods.
+    # Access to the latter may recover ``__builtins__``.
+    if '__' in string:
+        raise ValueError("Cannot evaluate strings containing '__'")
+
+    # Evaluate with an empty ``__builtins__`` (no ``__import__``, etc.).
+    return eval(source, {'__builtins__': {}})

From 4740f139f31725097ac085c31d2aa71bba63c619 Mon Sep 17 00:00:00 2001
From: Hristo Georgiev <hristo.a.georgiev@gmail.com>
Date: Wed, 7 Apr 2021 16:49:30 +0000
Subject: [PATCH 2/2] Employ `@pytest.mark.parametrize` in
 `test_safer_eval_tuple`

---
 partd/tests/test_numpy.py | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/partd/tests/test_numpy.py b/partd/tests/test_numpy.py
index 1c2ff34..3f3f0a9 100644
--- a/partd/tests/test_numpy.py
+++ b/partd/tests/test_numpy.py
@@ -76,23 +76,19 @@ def test_non_utf8_bytes():
     assert (partd.numpy.deserialize(s, 'O') == a).all()
 
 
-def test_safer_eval_tuples():
-    # Test different quotation mark types.
-    assert np.dtype(safer_eval(b'[("a", "i4")]')) == np.dtype([('a', '<i4')])
-
-    assert np.dtype(safer_eval(b"[('a', 'i4')]")) == np.dtype([('a', '<i4')])
-    assert np.dtype(safer_eval(b"[('b', 'i2')]")) == np.dtype([('b', '<i2')])
-    assert np.dtype(safer_eval(b"[('c', 'f8')]")) == np.dtype([('c', '<f8')])
-
-    assert (
-        np.dtype(safer_eval(b"[('x', 'i4'), ('y', 'i4')]")) ==
-        np.dtype([('x', '<i4'), ('y', '<i4')])
-    )
-
-    assert (
-        np.dtype(safer_eval(b"[('a', 'i4'), ('b', 'i2'), ('c', 'f8')]")) ==
-        np.dtype([('a', '<i4'), ('b', '<i2'), ('c', '<f8')])
-    )
+@pytest.mark.parametrize('text,parsed', [
+    (b'[("a", "i4")]', [('a', '<i4')]), # Test different quotation mark types.
+    (b"[('a', 'i4')]", [('a', '<i4')]),
+    (b"[('b', 'i2')]", [('b', '<i2')]),
+    (b"[('c', 'f8')]", [('c', '<f8')]),
+    (b"[('x', 'i4'), ('y', 'i4')]", [('x', '<i4'), ('y', '<i4')]),
+    (
+        b"[('a', 'i4'), ('b', 'i2'), ('c', 'f8')]",
+        [('a', '<i4'), ('b', '<i2'), ('c', '<f8')],
+    ),
+])
+def test_safer_eval_tuple(text, parsed):
+    assert np.dtype(safer_eval(text)) == np.dtype(parsed)
 
 
 @pytest.mark.parametrize('text,parsed', [