From 11fdf0bbae0dda465163251370c5e2212c24c6b0 Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Sat, 10 Feb 2024 10:10:56 -0500 Subject: [PATCH 1/2] make column extensions normal functions --- quinn/__init__.py | 14 ++++ quinn/extensions/column_ext.py | 105 ------------------------- quinn/functions.py | 90 +++++++++++++++++++++ tests/extensions/test_column_ext.py | 118 ---------------------------- tests/test_functions.py | 110 ++++++++++++++++++++++++++ 5 files changed, 214 insertions(+), 223 deletions(-) delete mode 100644 quinn/extensions/column_ext.py delete mode 100644 tests/extensions/test_column_ext.py diff --git a/quinn/__init__.py b/quinn/__init__.py index 25dc8d8c..727d5517 100644 --- a/quinn/__init__.py +++ b/quinn/__init__.py @@ -31,6 +31,13 @@ uuid5, week_end_date, week_start_date, + is_falsy, + is_truthy, + is_false, + is_true, + is_null_or_blank, + is_not_in, + null_between, ) from quinn.schema_helpers import print_schema_as_code from quinn.split_columns import split_col @@ -78,4 +85,11 @@ "sort_columns", "append_if_schema_identical", "flatten_dataframe", + "is_falsy", + "is_truthy", + "is_false", + "is_true", + "is_null_or_blank", + "is_not_in", + "null_between", ] diff --git a/quinn/extensions/column_ext.py b/quinn/extensions/column_ext.py deleted file mode 100644 index f44ea006..00000000 --- a/quinn/extensions/column_ext.py +++ /dev/null @@ -1,105 +0,0 @@ -from __future__ import annotations - -from typing import Any - -from pyspark.sql.column import Column -from pyspark.sql.functions import lit, trim, when - - -def isFalsy(self: Column) -> Column: - """Returns a Column indicating whether all values in the Column are False or NULL (**falsy**). - - Each element in the resulting column is True if all the elements in the - Column are either NULL or False, or False otherwise. This is accomplished by - performing a bitwise or of the ``isNull`` condition and a literal False value and - then wrapping the result in a **when** statement. - - :param self: Column object - :returns: Column object - :rtype: Column - """ - return when(self.isNull() | (self == lit(False)), True).otherwise(False) - - -def isTruthy(self: Column) -> Column: - """Calculates a boolean expression that is the opposite of isFalsy for the given ``Column`` self. - - :param Column self: The ``Column`` to calculate the opposite of isFalsy for. - :returns: A ``Column`` with the results of the calculation. - :rtype: Column - """ - return ~(self.isFalsy()) - - -def isFalse(self: Column) -> Column: - """Function checks if the column is equal to False and returns the column. - - :param self: Column - :return: Column - :rtype: Column - """ - return self == lit(False) - - -def isTrue(self: Column) -> Column: - """Function takes a column of type Column as an argument and returns a column of type Column. - - It evaluates whether each element in the column argument is equal to True, and - if so will return True, otherwise False. - - :param self: Column object - :returns: Column object - :rtype: Column - """ - return self == lit(True) - - -def isNullOrBlank(self: Column) -> Column: - r"""Returns a Boolean value which expresses whether a given column is ``null`` or contains only blank characters. - - :param \*\*self: The :class:`Column` to check. - - :returns: A `Column` containing ``True`` if the column is ``null`` or only contains - blank characters, or ``False`` otherwise. - :rtype: Column - """ - return (self.isNull()) | (trim(self) == "") - - -def isNotIn(self: Column, _list: list[Any]) -> Column: - """To see if a value is not in a list of values. - - :param self: Column object - :_list: list[Any] - :rtype: Column - """ - return ~(self.isin(_list)) - - -def nullBetween(self: Column, lower: Column, upper: Column) -> Column: - """To see if a value is between two values in a null friendly way. - - :param self: Column object - :lower: Column - :upper: Column - :rtype: Column - """ - return when(lower.isNull() & upper.isNull(), False).otherwise( - when(self.isNull(), False).otherwise( - when(lower.isNull() & upper.isNotNull() & (self <= upper), True).otherwise( - when( - lower.isNotNull() & upper.isNull() & (self >= lower), - True, - ).otherwise(self.between(lower, upper)), - ), - ), - ) - - -Column.isFalsy = isFalsy -Column.isTruthy = isTruthy -Column.isFalse = isFalse -Column.isTrue = isTrue -Column.isNullOrBlank = isNullOrBlank -Column.isNotIn = isNotIn -Column.nullBetween = nullBetween diff --git a/quinn/functions.py b/quinn/functions.py index 9f62bc8a..e658a0bd 100644 --- a/quinn/functions.py +++ b/quinn/functions.py @@ -14,6 +14,7 @@ import uuid from typing import Any +from pyspark.sql.functions import lit, trim, when import pyspark.sql.functions as F # noqa: N812 from pyspark.sql.types import ( ArrayType, @@ -302,3 +303,92 @@ def uuid5( variant_part, F.substring(hashed, 21, 12), ) + +def is_falsy(col: Column) -> Column: + """Returns a Column indicating whether all values in the Column are False or NULL (**falsy**). + + Each element in the resulting column is True if all the elements in the + Column are either NULL or False, or False otherwise. This is accomplished by + performing a bitwise or of the ``isNull`` condition and a literal False value and + then wrapping the result in a **when** statement. + + :param col: Column object + :returns: Column object + :rtype: Column + """ + return when(col.isNull() | (col == lit(False)), True).otherwise(False) + + +def is_truthy(col: Column) -> Column: + """Calculates a boolean expression that is the opposite of isFalsy for the given ``Column`` col. + + :param Column col: The ``Column`` to calculate the opposite of isFalsy for. + :returns: A ``Column`` with the results of the calculation. + :rtype: Column + """ + return ~(is_falsy(col)) + + +def is_false(col: Column) -> Column: + """Function checks if the column is equal to False and returns the column. + + :param col: Column + :return: Column + :rtype: Column + """ + return col == lit(False) + + +def is_true(col: Column) -> Column: + """Function takes a column of type Column as an argument and returns a column of type Column. + + It evaluates whether each element in the column argument is equal to True, and + if so will return True, otherwise False. + + :param col: Column object + :returns: Column object + :rtype: Column + """ + return col == lit(True) + + +def is_null_or_blank(col: Column) -> Column: + r"""Returns a Boolean value which expresses whether a given column is ``null`` or contains only blank characters. + + :param \*\*col: The :class:`Column` to check. + + :returns: A `Column` containing ``True`` if the column is ``null`` or only contains + blank characters, or ``False`` otherwise. + :rtype: Column + """ + return (col.isNull()) | (trim(col) == "") + + +def is_not_in(col: Column, _list: list[Any]) -> Column: + """To see if a value is not in a list of values. + + :param col: Column object + :_list: list[Any] + :rtype: Column + """ + return ~(col.isin(_list)) + + +def null_between(col: Column, lower: Column, upper: Column) -> Column: + """To see if a value is between two values in a null friendly way. + + :param col: Column object + :lower: Column + :upper: Column + :rtype: Column + """ + return when(lower.isNull() & upper.isNull(), False).otherwise( + when(col.isNull(), False).otherwise( + when(lower.isNull() & upper.isNotNull() & (col <= upper), True).otherwise( + when( + lower.isNotNull() & upper.isNull() & (col >= lower), + True, + ).otherwise(col.between(lower, upper)), + ), + ), + ) \ No newline at end of file diff --git a/tests/extensions/test_column_ext.py b/tests/extensions/test_column_ext.py deleted file mode 100644 index 267e8d86..00000000 --- a/tests/extensions/test_column_ext.py +++ /dev/null @@ -1,118 +0,0 @@ -import pyspark.sql.functions as F -from pyspark.sql.types import StringType, BooleanType, IntegerType -import quinn -import chispa - -from ..spark import spark -from quinn.extensions import * # noqa - - -def test_is_falsy(): - source_df = quinn.create_df( - spark, - [(True, False), (False, True), (None, True)], - [ - ("has_stuff", BooleanType(), True), - ("expected", BooleanType(), True), - ], - ) - actual_df = source_df.withColumn("is_has_stuff_falsy", F.col("has_stuff").isFalsy()) - chispa.assert_column_equality(actual_df, "is_has_stuff_falsy", "expected") - - -def test_is_truthy(): - source_df = quinn.create_df( - spark, - [(True, True), (False, False), (None, False)], - [("has_stuff", BooleanType(), True), ("expected", BooleanType(), True)], - ) - actual_df = source_df.withColumn( - "is_has_stuff_truthy", F.col("has_stuff").isTruthy() - ) - chispa.assert_column_equality(actual_df, "is_has_stuff_truthy", "expected") - - -def test_is_false(): - source_df = quinn.create_df( - spark, - [(True, False), (False, True), (None, None)], - [("has_stuff", BooleanType(), True), ("expected", BooleanType(), True)], - ) - actual_df = source_df.withColumn("is_has_stuff_false", F.col("has_stuff").isFalse()) - chispa.assert_column_equality(actual_df, "is_has_stuff_false", "expected") - - -def test_is_true(): - source_df = quinn.create_df( - spark, - [(True, True), (False, False), (None, None)], - [("has_stuff", BooleanType(), True), ("expected", BooleanType(), True)], - ) - actual_df = source_df.withColumn("is_stuff_true", F.col("has_stuff").isTrue()) - chispa.assert_column_equality(actual_df, "is_stuff_true", "expected") - - -def test_is_null_or_blank(): - source_df = quinn.create_df( - spark, - [ - ("", True), - (" ", True), - (None, True), - ("hi", False), - ], - [ - ("blah", StringType(), True), - ("expected", BooleanType(), True), - ], - ) - actual_df = source_df.withColumn( - "is_blah_null_or_blank", F.col("blah").isNullOrBlank() - ) - chispa.assert_column_equality(actual_df, "is_blah_null_or_blank", "expected") - - -def test_is_not_in(): - source_df = quinn.create_df( - spark, - [ - ("surfing", True), - ("swimming", True), - ("dancing", False), - ], - [ - ("fun_thing", StringType(), True), - ("expected", BooleanType(), True), - ], - ) - bobs_hobbies = ["dancing", "snowboarding"] - actual_df = source_df.withColumn( - "is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies) - ) - chispa.assert_column_equality(actual_df, "is_not_bobs_hobby", "expected") - - -def test_null_between(): - source_df = quinn.create_df( - spark, - [ - (17, None, 94, True), - (17, None, 10, False), - (None, 10, 5, True), - (None, 10, 88, False), - (10, 15, 11, True), - (None, None, 11, False), - (3, 5, None, False), - (None, None, None, False), - ], - [ - ("lower_age", IntegerType(), True), - ("upper_age", IntegerType(), True), - ("age", IntegerType(), True), - ("expected", BooleanType(), True), - ], - ) - actual_df = source_df.withColumn( - "is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age")) - ) - chispa.assert_column_equality(actual_df, "is_between", "expected") diff --git a/tests/test_functions.py b/tests/test_functions.py index a0065a44..4a9aa3b5 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -414,3 +414,113 @@ def test_with_extra_string(): ), ) chispa.assert_column_equality(actual_df, "uuid5_of_s1", "expected") + +def test_is_falsy(): + source_df = quinn.create_df( + spark, + [(True, False), (False, True), (None, True)], + [ + ("has_stuff", BooleanType(), True), + ("expected", BooleanType(), True), + ], + ) + actual_df = source_df.withColumn("is_has_stuff_falsy", quinn.is_falsy(F.col("has_stuff"))) + chispa.assert_column_equality(actual_df, "is_has_stuff_falsy", "expected") + + +def test_is_truthy(): + source_df = quinn.create_df( + spark, + [(True, True), (False, False), (None, False)], + [("has_stuff", BooleanType(), True), ("expected", BooleanType(), True)], + ) + actual_df = source_df.withColumn( + "is_has_stuff_truthy", quinn.is_truthy(F.col("has_stuff")) + ) + chispa.assert_column_equality(actual_df, "is_has_stuff_truthy", "expected") + + +def test_is_false(): + source_df = quinn.create_df( + spark, + [(True, False), (False, True), (None, None)], + [("has_stuff", BooleanType(), True), ("expected", BooleanType(), True)], + ) + actual_df = source_df.withColumn("is_has_stuff_false", quinn.is_false(F.col("has_stuff"))) + chispa.assert_column_equality(actual_df, "is_has_stuff_false", "expected") + + +def test_is_true(): + source_df = quinn.create_df( + spark, + [(True, True), (False, False), (None, None)], + [("has_stuff", BooleanType(), True), ("expected", BooleanType(), True)], + ) + actual_df = source_df.withColumn("is_stuff_true", quinn.is_true(F.col("has_stuff"))) + chispa.assert_column_equality(actual_df, "is_stuff_true", "expected") + + +def test_is_null_or_blank(): + source_df = quinn.create_df( + spark, + [ + ("", True), + (" ", True), + (None, True), + ("hi", False), + ], + [ + ("blah", StringType(), True), + ("expected", BooleanType(), True), + ], + ) + actual_df = source_df.withColumn( + "is_blah_null_or_blank", quinn.is_null_or_blank(F.col("blah")) + ) + chispa.assert_column_equality(actual_df, "is_blah_null_or_blank", "expected") + + +def test_is_not_in(): + source_df = quinn.create_df( + spark, + [ + ("surfing", True), + ("swimming", True), + ("dancing", False), + ], + [ + ("fun_thing", StringType(), True), + ("expected", BooleanType(), True), + ], + ) + bobs_hobbies = ["dancing", "snowboarding"] + actual_df = source_df.withColumn( + "is_not_bobs_hobby", quinn.is_not_in(F.col("fun_thing"), (bobs_hobbies)) + ) + chispa.assert_column_equality(actual_df, "is_not_bobs_hobby", "expected") + + +def test_null_between(): + source_df = quinn.create_df( + spark, + [ + (17, None, 94, True), + (17, None, 10, False), + (None, 10, 5, True), + (None, 10, 88, False), + (10, 15, 11, True), + (None, None, 11, False), + (3, 5, None, False), + (None, None, None, False), + ], + [ + ("lower_age", IntegerType(), True), + ("upper_age", IntegerType(), True), + ("age", IntegerType(), True), + ("expected", BooleanType(), True), + ], + ) + actual_df = source_df.withColumn( + "is_between", quinn.null_between(F.col("age"), F.col("lower_age"), F.col("upper_age")) + ) + chispa.assert_column_equality(actual_df, "is_between", "expected") From c2e581a159521b42d406a2393b8251ce8d7ecace Mon Sep 17 00:00:00 2001 From: Matthew Powers Date: Sat, 10 Feb 2024 10:12:46 -0500 Subject: [PATCH 2/2] remove regexp_extract_all --- quinn/__init__.py | 2 -- quinn/functions.py | 14 -------------- tests/test_functions.py | 14 -------------- 3 files changed, 30 deletions(-) diff --git a/quinn/__init__.py b/quinn/__init__.py index 727d5517..3f6cbd24 100644 --- a/quinn/__init__.py +++ b/quinn/__init__.py @@ -24,7 +24,6 @@ exists, forall, multi_equals, - regexp_extract_all, remove_all_whitespace, remove_non_word_characters, single_space, @@ -75,7 +74,6 @@ "week_start_date", "week_end_date", "approx_equal", - "regexp_extract_all", "business_days_between", "uuid5", "with_columns_renamed", diff --git a/quinn/functions.py b/quinn/functions.py index e658a0bd..d09a0818 100644 --- a/quinn/functions.py +++ b/quinn/functions.py @@ -230,20 +230,6 @@ def array_choice(col: Column, seed: int | None = None) -> Column: return col[index] -@F.udf(returnType=ArrayType(StringType())) -def regexp_extract_all(s: Column, regexp: Column) -> Column: - """Function uses the Python `re` library to extract regular expressions from a string (`s`) using a regex pattern (`regexp`). - - It returns a list of all matches, or `None` if `s` is `None`. - - :param s: input string (`Column`) - :type s: str - :param regexp: string `re` pattern - :rtype: Column - """ - return None if s is None else re.findall(regexp, s) - - def business_days_between( start_date: Column, end_date: Column, # noqa: ARG001 ) -> Column: diff --git a/tests/test_functions.py b/tests/test_functions.py index 4a9aa3b5..aaafda6c 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -329,20 +329,6 @@ def it_works_with_integer_values(): # chispa.assert_column_equality(actual_df, "random_letter", "expected") -def test_regexp_extract_all(): - df = quinn.create_df( - spark, - [("200 - 300 PA.", ["200", "300"]), ("400 PA.", ["400"]), (None, None)], - [ - ("str", StringType(), True), - ("expected", ArrayType(StringType(), True), True), - ], - ) - actual_df = df.withColumn( - "all_numbers", quinn.regexp_extract_all(F.col("str"), F.lit(r"(\d+)")) - ) - chispa.assert_column_equality(actual_df, "all_numbers", "expected") - def test_business_days_between(): df = quinn.create_df(