From e85c4c50a0c8b5d15ff4e2148f664c02a7543679 Mon Sep 17 00:00:00 2001
From: Harel Ben-Attia <harelba@gmail.com>
Date: Sun, 13 Sep 2020 17:29:53 +0300
Subject: [PATCH] option to list udfs, and added new functions

---
 bin/q.py             | 180 +++++++++++++++++++++++++++++++++++++++----
 mkdocs/docs/index.md |   8 +-
 test/test-suite      | 146 ++++++++++++++++++++++++++++-------
 3 files changed, 290 insertions(+), 44 deletions(-)

diff --git a/bin/q.py b/bin/q.py
index e2e9fb8d..5e165c85 100755
--- a/bin/q.py
+++ b/bin/q.py
@@ -31,6 +31,8 @@
 from __future__ import division
 from __future__ import print_function
 
+from collections import OrderedDict
+
 q_version = '2.0.16'
 
 __all__ = [ 'QTextAsData' ]
@@ -72,11 +74,26 @@ def get_stdout_encoding(encoding_override=None):
 
 SHOW_SQL = False
 
-def sha1(data):
-    if not isinstance(data,str) and not isinstance(data,unicode):
-        return hashlib.sha1(str(data)).hexdigest()
-    return hashlib.sha1(data).hexdigest()
+sha_algorithms = {  # SHA variant number, as passed to the sha() UDF -> hashlib constructor
+    1 : hashlib.sha1,
+    224: hashlib.sha224,
+    256: hashlib.sha256,
+    384: hashlib.sha384,
+    512: hashlib.sha512
+}
+
+def sha(data,algorithm,encoding):
+    """Return the hex SHA digest of data (as text, encoded with encoding), or None on any failure."""
+    try:
+        return sha_algorithms[algorithm](six.text_type(data).encode(encoding)).hexdigest()
+    except Exception as e:
+        print(e, file=sys.stderr)  # best effort: report on stderr so stdout carries only query output
+
+# Kept for backward compatibility: sha1(x,enc) is shorthand for sha(x,1,enc)
+def sha1(data,encoding):
+    return sha(data,algorithm=1,encoding=encoding)
 
+# TODO Add caching of compiled regexps - Will be added after benchmarking capability is baked in
 def regexp(regular_expression, data):
     if data is not None:
         if not isinstance(data, str) and not isinstance(data, unicode):
@@ -85,15 +102,16 @@ def regexp(regular_expression, data):
     else:
         return False
 
-def md5(data,encoding='utf-8'):
+def md5(data,encoding):
     m = hashlib.md5()
     m.update(six.text_type(data).encode(encoding))
     return m.hexdigest()
 
-class Sqlite3DBResults(object):
-    def __init__(self,query_column_names,results):
-        self.query_column_names = query_column_names
-        self.results = results
+def sqrt(data):
+    return None if data is None else math.sqrt(data)  # NULL in, NULL out instead of a TypeError
+
+def power(data,p):
+    return None if data is None or p is None else data**p  # a NULL operand yields NULL
 
 def percentile(l, p):
     # TODO Alpha implementation, need to provide multiple interpolation methods, and add tests
@@ -106,6 +124,7 @@ def percentile(l, p):
         return l[int(k)]
     return (c-k) * l[int(f)] + (k-f) * l[int(c)]
 
+# TODO Streaming Percentile to prevent memory consumption blowup for large datasets
 class StrictPercentile(object):
     def __init__(self):
         self.values = []
@@ -121,6 +140,130 @@ def finalize(self):
         else:
             return percentile(sorted(self.values),self.p)
 
+class StdevPopulation(object):
+    """sqlite3 aggregate: population standard deviation, accumulated with Welford's online update."""
+    def __init__(self):
+        self.M = 0.0  # running mean of the values seen so far
+        self.S = 0.0  # running sum of squared deviations from the mean
+        self.k = 0  # number of non-null values seen so far
+
+    def step(self, value):
+        """Fold one value into the running state. None is skipped; non-numeric text raises Exception."""
+        if value is None:
+            return
+        try:
+            val = float(value)
+        except ValueError:
+            # TODO propagate udf errors to console
+            raise Exception("Data is not numeric when calculating stddev (%s)" % value)
+        tM = self.M
+        self.k += 1
+        self.M += ((val - tM) / self.k)
+        self.S += ((val - tM) * (val - self.M))
+
+    def finalize(self):
+        """Return sqrt(S / k), or None when no values were accumulated."""
+        # A single value is a valid population with zero spread; only k == 0 would divide by zero.
+        return math.sqrt(self.S / self.k) if self.k else None
+
+class StdevSample(object):
+    """sqlite3 aggregate computing the sample standard deviation (n-1 denominator) in one pass."""
+    def __init__(self):
+        self.M = 0.0  # mean of the values accumulated so far
+        self.S = 0.0  # sum of squared differences from that mean
+        self.k = 0  # how many non-null values were accumulated
+
+    def step(self, value):
+        """Accumulate one value; None is ignored, non-numeric text raises Exception."""
+        if value is None:
+            return
+        try:
+            val = float(value)
+        except ValueError:
+            # TODO propagate udf errors to console
+            raise Exception("Data is not numeric when calculating stddev (%s)" % value)
+        self.k += 1
+        delta = val - self.M
+        self.M += (delta / self.k)
+        self.S += (delta * (val - self.M))
+
+    def finalize(self):
+        """Return sqrt(S / (k-1)), or None when fewer than two values were accumulated."""
+        enough_values = self.k > 1  # k-1 would be zero (or negative) otherwise
+        return math.sqrt(self.S / (self.k-1)) if enough_values else None
+
+class FunctionType(object):
+    REGULAR = 1  # tag carried by the plain-function entries of user_functions
+    AGG = 2  # tag carried by the aggregate-class entries of user_functions
+
+class UserFunctionDef(object):
+    """Metadata record describing one user-defined SQL function exposed by q."""
+    def __init__(self,func_type,name,usage,description,func_or_obj,param_count):
+        # What kind of function this is, and the callable/class implementing it
+        self.func_type, self.func_or_obj = func_type, func_or_obj
+        # How it is registered: SQL name and number of arguments
+        self.name, self.param_count = name, param_count
+        self.usage, self.description = usage, description  # help texts shown when listing functions
+
+user_functions = [  # registry of all UDFs: drives both sqlite registration and the -L listing
+    UserFunctionDef(FunctionType.REGULAR,
+                    "regexp","regexp(<regular_expression>,<expr>) = <1|0>",
+                    "Find regexp in string expression. Returns 1 if found or 0 if not",
+                    regexp,
+                    2),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "sha","sha(<expr>,<algorithm>,<encoding>) = <hex-string-of-sha>",
+                    "Calculate sha of some expression. Algorithm can be one of 1,224,256,384,512. For now encoding must be manually provided. Will use the input encoding automatically in the future.",
+                    sha,
+                    3),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "sha1","sha1(<expr>,<encoding>) = <hex-string-of-sha>",
+                    "Calculate sha1 of some expression. For now encoding must be manually provided. Will be taken automatically from the input encoding in the future.",
+                    sha1,
+                    2),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "md5","md5(<expr>,<encoding>) = <hex-string-of-md5>",
+                    "Calculate md5 of expression. Returns a hex-string of the result. Currently requires to manually provide the encoding of the data. Will be taken automatically from the input encoding in the future.",
+                    md5,
+                    2),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "sqrt","sqrt(<expr>) = <square-root>",
+                    "Calculate the square root of the expression",
+                    sqrt,
+                    1),
+    UserFunctionDef(FunctionType.REGULAR,
+                    "power","power(<expr1>,<expr2>) = <expr1-to-the-power-of-expr2>",
+                    "Raise expr1 to the power of expr2",
+                    power,
+                    2),
+    UserFunctionDef(FunctionType.AGG,
+                    "percentile","percentile(<expr>,<percentile-in-the-range-0-to-1>) = <percentile-value>",
+                    "Calculate the strict percentile of a set of values.",
+                    StrictPercentile,
+                    2),
+    UserFunctionDef(FunctionType.AGG,
+                    "stddev_pop","stddev_pop(<expr>) = <stddev-value>",
+                    "Calculate the population standard deviation of a set of values",
+                    StdevPopulation,
+                    1),
+    UserFunctionDef(FunctionType.AGG,
+                    "stddev_sample","stddev_sample(<expr>) = <stddev-value>",
+                    "Calculate the sample standard deviation of a set of values",
+                    StdevSample,
+                    1)
+]
+
+def print_user_functions():
+    """Print the name, usage and description of every entry in user_functions to stdout."""
+    template = "Function: %s\n     Usage: %s\n     Description: %s"
+    for udf in user_functions:
+        print(template % (udf.name, udf.usage, udf.description))
+
+class Sqlite3DBResults(object):
+    """Pairs a query's column names with its results."""
+    def __init__(self,query_column_names,results):
+        self.query_column_names, self.results = query_column_names, results
+
 class Sqlite3DB(object):
 
     def __init__(self, show_sql=SHOW_SQL):
@@ -169,11 +312,13 @@ def store_db_to_disk(self,sqlite_db_filename,table_names_mapping,method='standar
             raise ValueError('Unknown store-db-to-disk method %s' % method)
 
     def add_user_functions(self):
-        self.conn.create_function("regexp", 2, regexp)
-        self.conn.create_function("sha1", 1, sha1)
-        self.conn.create_function("md5", 2, md5)
-        self.conn.create_function("md5", 1, md5)
-        self.conn.create_aggregate("percentile",2,StrictPercentile)
+        for udf in user_functions:  # register by declared func_type, not by sniffing the Python type
+            if udf.func_type == FunctionType.AGG:
+                self.conn.create_aggregate(udf.name,udf.param_count,udf.func_or_obj)
+            elif udf.func_type == FunctionType.REGULAR:
+                self.conn.create_function(udf.name,udf.param_count,udf.func_or_obj)
+            else:
+                raise Exception("Invalid user function definition %s" % str(udf))
 
     def is_numeric_type(self, column_type):
         return column_type in self.numeric_column_types
@@ -1791,6 +1936,8 @@ def get_option_with_default(p, option_type, option, default):
                       help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
     output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal",
                       help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
+    output_data_option_group.add_option("-L","--list-user-functions",dest="list_user_functions",default=False,action="store_true",
+                      help="List all user functions")
     parser.add_option_group(output_data_option_group)
     #-----------------------------------------------
     query_option_group = OptionGroup(parser,"Query Related Options")
@@ -1808,6 +1955,11 @@ def get_option_with_default(p, option_type, option, default):
         sys.exit(0)
 
 ###
+
+    if options.list_user_functions:
+        print_user_functions()
+        sys.exit(0)
+
     if len(args) == 0 and options.query_filename is None:
         print_credentials()
         print("Must provide at least one query in the command line, or through a file with the -q parameter", file=sys.stderr)
diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md
index e450b0a9..e2b37439 100644
--- a/mkdocs/docs/index.md
+++ b/mkdocs/docs/index.md
@@ -89,7 +89,7 @@ Usage:
 
         Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line.
 
-        Basic usage is q "<sql-like query>" where table names are just regular file names (Use - to read from standard input)
+        Basic usage is q "<sql like query>" where table names are just regular file names (Use - to read from standard input)
             When the input contains a header row, use -H, and column names will be set according to the header row content. If there isn't a header row, then columns will automatically be named c1..cN.
 
         Column types are detected automatically. Use -A in order to see the column name/type analysis.
@@ -133,6 +133,8 @@ Options:
     -d DELIMITER, --delimiter=DELIMITER
                         Field delimiter. If none specified, then space is used
                         as the delimiter.
+    -p, --pipe-delimited
+                        Same as -d '|'. Added for convenience and readability
     -t, --tab-delimited
                         Same as -d <tab>. Just a shorthand for handling
                         standard tab delimited file You can use $'\t' if you
@@ -186,6 +188,8 @@ Options:
                         Field delimiter for output. If none specified, then
                         the -d delimiter is used if present, or space if no
                         delimiter is specified
+    -P, --pipe-delimited-output
+                        Same as -D '|'. Added for convenience and readability.
     -T, --tab-delimited-output
                         Same as -D <tab>. Just a shorthand for outputting tab
                         delimited output. You can use -D $'\t' if you want.
@@ -210,6 +214,8 @@ Options:
                         nonnumeric and none. Note the slightly misleading
                         parameter name, and see the matching -w parameter for
                         input quoting.
+    -L, --list-user-functions
+                        List all user functions
 
   Query Related Options:
     -q QUERY_FILENAME, --query-filename=QUERY_FILENAME
diff --git a/test/test-suite b/test/test-suite
index 4f1f877b..a5837701 100755
--- a/test/test-suite
+++ b/test/test-suite
@@ -11,7 +11,6 @@
 #
 
 import unittest
-import pytest
 import random
 import json
 from json import JSONEncoder
@@ -283,34 +282,6 @@ class BasicTests(AbstractQTestCase):
 
         self.cleanup(tmpfile)
 
-    def test_regexp_int_data_handling(self):
-        tmpfile = self.create_file_with_data(sample_data_no_header)
-
-        cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
-        retcode, o, e = run_command(cmd)
-
-        self.assertEqual(retcode, 0)
-        self.assertEqual(len(o), 1)
-        self.assertEqual(len(e), 0)
-
-        self.assertEqual(o[0],six.b("1"))
-
-        self.cleanup(tmpfile)
-
-    def test_regexp_null_data_handling(self):
-        tmpfile = self.create_file_with_data(sample_data_no_header)
-
-        cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
-        retcode, o, e = run_command(cmd)
-
-        self.assertEqual(retcode, 0)
-        self.assertEqual(len(o), 1)
-        self.assertEqual(len(e), 0)
-
-        self.assertEqual(o[0],six.b("2"))
-
-        self.cleanup(tmpfile)
-
     def test_select_one_column(self):
         tmpfile = self.create_file_with_data(sample_data_no_header)
 
@@ -1525,6 +1496,55 @@ class BasicTests(AbstractQTestCase):
 
 
 class UserFunctionTests(AbstractQTestCase):
+    def test_regexp_int_data_handling(self):
+        tmpfile = self.create_file_with_data(sample_data_no_header)
+
+        cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode, 0)
+        self.assertEqual(len(o), 1)
+        self.assertEqual(len(e), 0)
+
+        self.assertEqual(o[0],six.b("1"))
+
+        self.cleanup(tmpfile)
+
+    def test_percentile_func(self):
+        cmd = 'seq 1000 1999 | %s "select substr(c1,0,3),percentile(c1,0),percentile(c1,0.5),percentile(c1,1) from - group by substr(c1,0,3)" -c 1' % Q_EXECUTABLE
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode, 0)
+        self.assertEqual(len(o), 10)
+        self.assertEqual(len(e), 0)
+
+        output_table = [l.split(six.b(" ")) for l in o]
+        group_labels = [int(row[0]) for row in output_table]
+        minimum_values = [float(row[1]) for row in output_table]
+        median_values = [float(row[2]) for row in output_table]
+        max_values = [float(row[3]) for row in output_table]
+
+        base_values = list(range(1000,2000,100))
+
+        self.assertEqual(group_labels,list(range(10,20)))
+        self.assertEqual(minimum_values,base_values)
+        self.assertEqual(median_values,list(map(lambda x: x + 49.5,base_values)))
+        self.assertEqual(max_values,list(map(lambda x: x + 99,base_values)))
+
+    def test_regexp_null_data_handling(self):
+        tmpfile = self.create_file_with_data(sample_data_no_header)
+
+        cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode, 0)
+        self.assertEqual(len(o), 1)
+        self.assertEqual(len(e), 0)
+
+        self.assertEqual(o[0],six.b("2"))
+
+        self.cleanup(tmpfile)
+
     def test_md5_function(self):
         cmd = 'seq 1 4 | %s -c 1 -d , "select c1,md5(c1,\'utf-8\') from -"' % Q_EXECUTABLE
         retcode, o, e = run_command(cmd)
@@ -1538,6 +1558,74 @@ class UserFunctionTests(AbstractQTestCase):
         self.assertEqual(tuple(o[2].split(six.b(','),1)),(six.b('3'),six.b('eccbc87e4b5ce2fe28308fd9f2a7baf3')))
         self.assertEqual(tuple(o[3].split(six.b(','),1)),(six.b('4'),six.b('a87ff679a2f3e71d9181a67b7542122c')))
 
+    def test_stddev_functions(self):
+        tmpfile = self.create_file_with_data(six.b("\n".join(map(str,[234,354,3234,123,4234,234,634,56,65]))))
+
+        cmd = '%s -c 1 -d , "select round(stddev_pop(c1),10),round(stddev_sample(c1),10) from %s"' % (Q_EXECUTABLE,tmpfile.name)
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode,0)
+        self.assertEqual(len(o),1)
+        self.assertEqual(len(e),0)
+
+        self.assertEqual(o[0],six.b('1479.7015464838,1569.4604964764'))  # output lines are bytes, as in sibling tests
+
+        self.cleanup(tmpfile)
+
+    def test_sqrt_function(self):
+        cmd = 'seq 1 5 | %s -c 1 -d , "select round(sqrt(c1),10) from -"' % Q_EXECUTABLE
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode,0)
+        self.assertEqual(len(o),5)
+        self.assertEqual(len(e),0)
+
+        self.assertEqual(o[0],six.b('1.0'))
+        self.assertEqual(o[1],six.b('1.4142135624'))
+        self.assertEqual(o[2],six.b('1.7320508076'))
+        self.assertEqual(o[3],six.b('2.0'))
+        self.assertEqual(o[4],six.b('2.2360679775'))
+
+    def test_power_function(self):
+        cmd = 'seq 1 5 | %s -c 1 -d , "select round(power(c1,2.5),10) from -"' % Q_EXECUTABLE
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode,0)
+        self.assertEqual(len(o),5)
+        self.assertEqual(len(e),0)
+
+        self.assertEqual(o[0],six.b('1.0'))
+        self.assertEqual(o[1],six.b('5.6568542495'))
+        self.assertEqual(o[2],six.b('15.5884572681'))
+        self.assertEqual(o[3],six.b('32.0'))
+        self.assertEqual(o[4],six.b('55.9016994375'))
+
+    def test_sha1_function(self):
+        cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha1(c1,\'utf-8\') from -"' % Q_EXECUTABLE
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode,0)
+        self.assertEqual(len(o),4)
+        self.assertEqual(len(e),0)
+
+        self.assertEqual(o[0],six.b('1,356a192b7913b04c54574d18c28d46e6395428ab'))
+        self.assertEqual(o[1],six.b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0'))
+        self.assertEqual(o[2],six.b('3,77de68daecd823babbb58edb1c8e14d7106e83bb'))
+        self.assertEqual(o[3],six.b('4,1b6453892473a467d07372d45eb05abc2031647a'))
+
+    def test_sha_function(self):
+        cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha(c1,1,\'utf-8\') as sha1,sha(c1,224,\'utf-8\') as sha224,sha(c1,256,\'utf-8\') as sha256 from -"' % Q_EXECUTABLE
+        retcode, o, e = run_command(cmd)
+
+        self.assertEqual(retcode,0)
+        self.assertEqual(len(o),4)
+        self.assertEqual(len(e),0)
+
+        self.assertEqual(o[0],six.b('1,356a192b7913b04c54574d18c28d46e6395428ab,e25388fde8290dc286a6164fa2d97e551b53498dcbf7bc378eb1f178,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b'))
+        self.assertEqual(o[1],six.b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0,58b2aaa0bfae7acc021b3260e941117b529b2e69de878fd7d45c61a9,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35'))
+        self.assertEqual(o[2],six.b('3,77de68daecd823babbb58edb1c8e14d7106e83bb,4cfc3a1811fe40afa401b25ef7fa0379f1f7c1930a04f8755d678474,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce'))
+        self.assertEqual(o[3],six.b('4,1b6453892473a467d07372d45eb05abc2031647a,271f93f45e9b4067327ed5c8cd30a034730aaace4382803c3e1d6c2f,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a'))
+
 
 class MultiHeaderTests(AbstractQTestCase):
     def test_output_header_when_multiple_input_headers_exist(self):