From e85c4c50a0c8b5d15ff4e2148f664c02a7543679 Mon Sep 17 00:00:00 2001 From: Harel Ben-Attia Date: Sun, 13 Sep 2020 17:29:53 +0300 Subject: [PATCH] option to list udfs, and added new functions --- bin/q.py | 180 +++++++++++++++++++++++++++++++++++++++---- mkdocs/docs/index.md | 8 +- test/test-suite | 146 ++++++++++++++++++++++++++++------- 3 files changed, 290 insertions(+), 44 deletions(-) diff --git a/bin/q.py b/bin/q.py index e2e9fb8d..5e165c85 100755 --- a/bin/q.py +++ b/bin/q.py @@ -31,6 +31,8 @@ from __future__ import division from __future__ import print_function +from collections import OrderedDict + q_version = '2.0.16' __all__ = [ 'QTextAsData' ] @@ -72,11 +74,26 @@ def get_stdout_encoding(encoding_override=None): SHOW_SQL = False -def sha1(data): - if not isinstance(data,str) and not isinstance(data,unicode): - return hashlib.sha1(str(data)).hexdigest() - return hashlib.sha1(data).hexdigest() +sha_algorithms = { + 1 : hashlib.sha1, + 224: hashlib.sha224, + 256: hashlib.sha256, + 386: hashlib.sha384, + 512: hashlib.sha512 +} + +def sha(data,algorithm,encoding): + try: + f = sha_algorithms[algorithm] + return f(six.text_type(data).encode(encoding)).hexdigest() + except Exception as e: + print(e) + +# For backward compatibility +def sha1(data,encoding): + return sha(data,1,encoding) +# TODO Add caching of compiled regexps - Will be added after benchmarking capability is baked in def regexp(regular_expression, data): if data is not None: if not isinstance(data, str) and not isinstance(data, unicode): @@ -85,15 +102,16 @@ def regexp(regular_expression, data): else: return False -def md5(data,encoding='utf-8'): +def md5(data,encoding): m = hashlib.md5() m.update(six.text_type(data).encode(encoding)) return m.hexdigest() -class Sqlite3DBResults(object): - def __init__(self,query_column_names,results): - self.query_column_names = query_column_names - self.results = results +def sqrt(data): + return math.sqrt(data) + +def power(data,p): + return data**p def percentile(l, p): # TODO Alpha implementation, need to provide multiple interpolation methods, and add tests @@ -106,6 +124,7 @@ def percentile(l, p): return l[int(k)] return (c-k) * l[int(f)] + (k-f) * l[int(c)] +# TODO Streaming Percentile to prevent memory consumption blowup for large datasets class StrictPercentile(object): def __init__(self): self.values = [] @@ -121,6 +140,130 @@ def finalize(self): else: return percentile(sorted(self.values),self.p) +class StdevPopulation(object): + def __init__(self): + self.M = 0.0 + self.S = 0.0 + self.k = 0 + + def step(self, value): + try: + # Ignore nulls + if value is None: + return + val = float(value) # if fails, skips this iteration, which also ignores nulls + tM = self.M + self.k += 1 + self.M += ((val - tM) / self.k) + self.S += ((val - tM) * (val - self.M)) + except ValueError: + # TODO propagate udf errors to console + raise Exception("Data is not numeric when calculating stddev (%s)" % value) + + def finalize(self): + if self.k <= 1: # avoid division by zero + return None + else: + return math.sqrt(self.S / (self.k)) + +class StdevSample(object): + def __init__(self): + self.M = 0.0 + self.S = 0.0 + self.k = 0 + + def step(self, value): + try: + # Ignore nulls + if value is None: + return + val = float(value) # if fails, skips this iteration, which also ignores nulls + tM = self.M + self.k += 1 + self.M += ((val - tM) / self.k) + self.S += ((val - tM) * (val - self.M)) + except ValueError: + # TODO propagate udf errors to console + raise Exception("Data is not numeric when calculating stddev (%s)" % value) + + def finalize(self): + if self.k <= 1: # avoid division by zero + return None + else: + return math.sqrt(self.S / (self.k-1)) + +class FunctionType(object): + REGULAR = 1 + AGG = 2 + +class UserFunctionDef(object): + def __init__(self,func_type,name,usage,description,func_or_obj,param_count): + self.func_type = func_type + self.name = name + self.usage = usage + self.description = description + self.func_or_obj = func_or_obj + self.param_count = param_count + +user_functions = [ + UserFunctionDef(FunctionType.REGULAR, + "regexp","regexp(,) = <1|0>", + "Find regexp in string expression. Returns 1 if found or 0 if not", + regexp, + 2), + UserFunctionDef(FunctionType.REGULAR, + "sha","sha(,,) = ", + "Calculate sha of some expression. Algorithm can be one of 1,224,256,384,512. For now encoding must be manually provided. Will use the input encoding automatically in the future.", + sha, + 3), + UserFunctionDef(FunctionType.REGULAR, + "sha1","sha1(,) = ", + "Calculate sha1 of some expression. For now encoding must be manually provided. Will be taken automatically from the input encoding in the future.", + sha1, + 2), + UserFunctionDef(FunctionType.REGULAR, + "md5","md5(,) = ", + "Calculate md5 of expression. Returns a hex-string of the result. Currently requires to manually provide the encoding of the data. Will be taken automatically from the input encoding in the future.", + md5, + 2), + UserFunctionDef(FunctionType.REGULAR, + "sqrt","sqrt() = ", + "Calculate the square root of the expression", + sqrt, + 1), + UserFunctionDef(FunctionType.REGULAR, + "power","power(,) = ", + "Raise expr1 to the power of expr2", + power, + 2), + UserFunctionDef(FunctionType.AGG, + "percentile","percentile(,) = ", + "Calculate the strict percentile of a set of a values.", + StrictPercentile, + 2), + UserFunctionDef(FunctionType.AGG, + "stddev_pop","stddev_pop() = ", + "Calculate the population standard deviation of a set of values", + StdevPopulation, + 1), + UserFunctionDef(FunctionType.AGG, + "stddev_sample","stddev_sample() = ", + "Calculate the sample standard deviation of a set of values", + StdevSample, + 1) +] + +def print_user_functions(): + for udf in user_functions: + print("Function: %s" % udf.name) + print(" Usage: %s" % udf.usage) + print(" Description: %s" % udf.description) + +class Sqlite3DBResults(object): + def __init__(self,query_column_names,results): + self.query_column_names = query_column_names + self.results = results + class Sqlite3DB(object): def __init__(self, show_sql=SHOW_SQL): @@ -169,11 +312,13 @@ def store_db_to_disk(self,sqlite_db_filename,table_names_mapping,method='standar raise ValueError('Unknown store-db-to-disk method %s' % method) def add_user_functions(self): - self.conn.create_function("regexp", 2, regexp) - self.conn.create_function("sha1", 1, sha1) - self.conn.create_function("md5", 2, md5) - self.conn.create_function("md5", 1, md5) - self.conn.create_aggregate("percentile",2,StrictPercentile) + for udf in user_functions: + if type(udf.func_or_obj) == type(object): + self.conn.create_aggregate(udf.name,udf.param_count,udf.func_or_obj) + elif type(udf.func_or_obj) == type(md5): + self.conn.create_function(udf.name,udf.param_count,udf.func_or_obj) + else: + raise Exception("Invalid user function definition %s" % str(udf)) def is_numeric_type(self, column_type): return column_type in self.numeric_column_types @@ -1791,6 +1936,8 @@ def get_option_with_default(p, option_type, option, default): help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding") output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal", help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.") + output_data_option_group.add_option("-L","--list-user-functions",dest="list_user_functions",default=False,action="store_true", + help="List all user functions") parser.add_option_group(output_data_option_group) #----------------------------------------------- query_option_group = OptionGroup(parser,"Query Related Options") @@ -1808,6 +1955,11 @@ def get_option_with_default(p, option_type, option, default): sys.exit(0) ### + + if options.list_user_functions: + print_user_functions() + sys.exit(0) + if len(args) == 0 and options.query_filename is None: print_credentials() print("Must provide at least one query in the command line, or through a file with the -q parameter", file=sys.stderr) diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md index e450b0a9..e2b37439 100644 --- a/mkdocs/docs/index.md +++ b/mkdocs/docs/index.md @@ -89,7 +89,7 @@ Usage: Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line. - Basic usage is q "" where table names are just regular file names (Use - to read from standard input) + Basic usage is q "" where table names are just regular file names (Use - to read from standard input) When the input contains a header row, use -H, and column names will be set according to the header row content. If there isn't a header row, then columns will automatically be named c1..cN. Column types are detected automatically. Use -A in order to see the column name/type analysis. @@ -133,6 +133,8 @@ Options: -d DELIMITER, --delimiter=DELIMITER Field delimiter. If none specified, then space is used as the delimiter. + -p, --pipe-delimited + Same as -d '|'. Added for convenience and readability -t, --tab-delimited Same as -d . Just a shorthand for handling standard tab delimited file You can use $'\t' if you @@ -186,6 +188,8 @@ Options: Field delimiter for output. If none specified, then the -d delimiter is used if present, or space if no delimiter is specified + -P, --pipe-delimited-output + Same as -D '|'. Added for convenience and readability. -T, --tab-delimited-output Same as -D . Just a shorthand for outputting tab delimited output. You can use -D $'\t' if you want. @@ -210,6 +214,8 @@ Options: nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting. + -L, --list-user-functions + List all user functions Query Related Options: -q QUERY_FILENAME, --query-filename=QUERY_FILENAME diff --git a/test/test-suite b/test/test-suite index 4f1f877b..a5837701 100755 --- a/test/test-suite +++ b/test/test-suite @@ -11,7 +11,6 @@ # import unittest -import pytest import random import json from json import JSONEncoder @@ -283,34 +282,6 @@ class BasicTests(AbstractQTestCase): self.cleanup(tmpfile) - def test_regexp_int_data_handling(self): - tmpfile = self.create_file_with_data(sample_data_no_header) - - cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name - retcode, o, e = run_command(cmd) - - self.assertEqual(retcode, 0) - self.assertEqual(len(o), 1) - self.assertEqual(len(e), 0) - - self.assertEqual(o[0],six.b("1")) - - self.cleanup(tmpfile) - - def test_regexp_null_data_handling(self): - tmpfile = self.create_file_with_data(sample_data_no_header) - - cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name - retcode, o, e = run_command(cmd) - - self.assertEqual(retcode, 0) - self.assertEqual(len(o), 1) - self.assertEqual(len(e), 0) - - self.assertEqual(o[0],six.b("2")) - - self.cleanup(tmpfile) - def test_select_one_column(self): tmpfile = self.create_file_with_data(sample_data_no_header) @@ -1525,6 +1496,55 @@ class BasicTests(AbstractQTestCase): class UserFunctionTests(AbstractQTestCase): + def test_regexp_int_data_handling(self): + tmpfile = self.create_file_with_data(sample_data_no_header) + + cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode, 0) + self.assertEqual(len(o), 1) + self.assertEqual(len(e), 0) + + self.assertEqual(o[0],six.b("1")) + + self.cleanup(tmpfile) + + def test_percentile_func(self): + cmd = 'seq 1000 1999 | %s "select substr(c1,0,3),percentile(c1,0),percentile(c1,0.5),percentile(c1,1) from - group by substr(c1,0,3)" -c 1' % Q_EXECUTABLE + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode, 0) + self.assertEqual(len(o), 10) + self.assertEqual(len(e), 0) + + output_table = [l.split(six.b(" ")) for l in o] + group_labels = [int(row[0]) for row in output_table] + minimum_values = [float(row[1]) for row in output_table] + median_values = [float(row[2]) for row in output_table] + max_values = [float(row[3]) for row in output_table] + + base_values = list(range(1000,2000,100)) + + self.assertEqual(group_labels,list(range(10,20))) + self.assertEqual(minimum_values,base_values) + self.assertEqual(median_values,list(map(lambda x: x + 49.5,base_values))) + self.assertEqual(max_values,list(map(lambda x: x + 99,base_values))) + + def test_regexp_null_data_handling(self): + tmpfile = self.create_file_with_data(sample_data_no_header) + + cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode, 0) + self.assertEqual(len(o), 1) + self.assertEqual(len(e), 0) + + self.assertEqual(o[0],six.b("2")) + + self.cleanup(tmpfile) + def test_md5_function(self): cmd = 'seq 1 4 | %s -c 1 -d , "select c1,md5(c1,\'utf-8\') from -"' % Q_EXECUTABLE retcode, o, e = run_command(cmd) @@ -1538,6 +1558,74 @@ class UserFunctionTests(AbstractQTestCase): self.assertEqual(tuple(o[2].split(six.b(','),1)),(six.b('3'),six.b('eccbc87e4b5ce2fe28308fd9f2a7baf3'))) self.assertEqual(tuple(o[3].split(six.b(','),1)),(six.b('4'),six.b('a87ff679a2f3e71d9181a67b7542122c'))) + def test_stddev_functions(self): + tmpfile = self.create_file_with_data(six.b("\n".join(map(str,[234,354,3234,123,4234,234,634,56,65])))) + + cmd = '%s -c 1 -d , "select round(stddev_pop(c1),10),round(stddev_sample(c1),10) from %s"' % (Q_EXECUTABLE,tmpfile.name) + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode,0) + self.assertEqual(len(o),1) + self.assertEqual(len(e),0) + + self.assertEqual(o[0],'1479.7015464838,1569.4604964764') + + self.cleanup(tmpfile) + + def test_sqrt_function(self): + cmd = 'seq 1 5 | %s -c 1 -d , "select round(sqrt(c1),10) from -"' % Q_EXECUTABLE + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode,0) + self.assertEqual(len(o),5) + self.assertEqual(len(e),0) + + self.assertEqual(o[0],six.b('1.0')) + self.assertEqual(o[1],six.b('1.4142135624')) + self.assertEqual(o[2],six.b('1.7320508076')) + self.assertEqual(o[3],six.b('2.0')) + self.assertEqual(o[4],six.b('2.2360679775')) + + def test_power_function(self): + cmd = 'seq 1 5 | %s -c 1 -d , "select round(power(c1,2.5),10) from -"' % Q_EXECUTABLE + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode,0) + self.assertEqual(len(o),5) + self.assertEqual(len(e),0) + + self.assertEqual(o[0],six.b('1.0')) + self.assertEqual(o[1],six.b('5.6568542495')) + self.assertEqual(o[2],six.b('15.5884572681')) + self.assertEqual(o[3],six.b('32.0')) + self.assertEqual(o[4],six.b('55.9016994375')) + + def test_sha1_function(self): + cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha1(c1,\'utf-8\') from -"' % Q_EXECUTABLE + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode,0) + self.assertEqual(len(o),4) + self.assertEqual(len(e),0) + + self.assertEqual(o[0],six.b('1,356a192b7913b04c54574d18c28d46e6395428ab')) + self.assertEqual(o[1],six.b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0')) + self.assertEqual(o[2],six.b('3,77de68daecd823babbb58edb1c8e14d7106e83bb')) + self.assertEqual(o[3],six.b('4,1b6453892473a467d07372d45eb05abc2031647a')) + + def test_sha_function(self): + cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha(c1,1,\'utf-8\') as sha1,sha(c1,224,\'utf-8\') as sha224,sha(c1,256,\'utf-8\') as sha256 from -"' % Q_EXECUTABLE + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode,0) + self.assertEqual(len(o),4) + self.assertEqual(len(e),0) + + self.assertEqual(o[0],six.b('1,356a192b7913b04c54574d18c28d46e6395428ab,e25388fde8290dc286a6164fa2d97e551b53498dcbf7bc378eb1f178,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b')) + self.assertEqual(o[1],six.b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0,58b2aaa0bfae7acc021b3260e941117b529b2e69de878fd7d45c61a9,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35')) + self.assertEqual(o[2],six.b('3,77de68daecd823babbb58edb1c8e14d7106e83bb,4cfc3a1811fe40afa401b25ef7fa0379f1f7c1930a04f8755d678474,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce')) + self.assertEqual(o[3],six.b('4,1b6453892473a467d07372d45eb05abc2031647a,271f93f45e9b4067327ed5c8cd30a034730aaace4382803c3e1d6c2f,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a')) + class MultiHeaderTests(AbstractQTestCase): def test_output_header_when_multiple_input_headers_exist(self):