From 762e08f787c9af8cd29d0fde0d481d5bd3a5570d Mon Sep 17 00:00:00 2001 From: Nate Parsons <4307001+thehomebrewnerd@users.noreply.github.com> Date: Wed, 8 May 2024 09:16:33 -0500 Subject: [PATCH] Remove support for Dask and Pyspark Dataframes (#1857) * initial cleanup * mass cleanup * getting close * add pyarrow to docs reqs * update release notes * fix pr number * update test names * remove more unused code * fix release notes --- .github/workflows/build_docs.yaml | 6 +- .github/workflows/install_test.yaml | 3 - .../workflows/latest_dependency_checker.yaml | 19 - .../workflows/minimum_dependency_checker.yaml | 16 - .github/workflows/tests_with_latest_deps.yaml | 61 +- .../workflows/tests_with_minimum_deps.yaml | 12 +- .readthedocs.yaml | 5 - Makefile | 2 +- contributing.md | 30 +- .../custom_types_and_type_inference.ipynb | 4 +- docs/source/guides/guides_index.rst | 1 - .../logical_types_and_semantic_tags.ipynb | 71 +-- .../using_woodwork_with_dask_and_spark.ipynb | 354 ----------- .../guides/working_with_types_and_tags.ipynb | 4 +- docs/source/install.md | 68 +- docs/source/release_notes.rst | 6 + pyproject.toml | 15 +- woodwork/accessor_utils.py | 40 +- woodwork/column_accessor.py | 20 +- woodwork/deserialize.py | 2 +- woodwork/deserializers/csv_deserializer.py | 5 +- woodwork/deserializers/deserializer_base.py | 30 +- .../deserializers/feather_deserializer.py | 5 +- woodwork/deserializers/orc_deserializer.py | 5 +- .../deserializers/parquet_deserializer.py | 19 +- woodwork/deserializers/pickle_deserializer.py | 5 +- woodwork/deserializers/utils.py | 2 +- woodwork/indexers.py | 12 +- woodwork/logical_types.py | 119 +--- woodwork/serializers/csv_serializer.py | 22 +- woodwork/serializers/orc_serializer.py | 5 - woodwork/serializers/parquet_serializer.py | 86 +-- woodwork/serializers/pickle_serializer.py | 6 - woodwork/serializers/serializer_base.py | 16 +- .../_get_box_plot_info_for_column.py | 10 - .../statistics_utils/_get_dependence_dict.py | 16 +- .../statistics_utils/_get_describe_dict.py | 25 +- .../statistics_utils/_get_value_counts.py | 18 +- woodwork/table_accessor.py | 67 +- .../tests/accessor/test_column_accessor.py | 139 +--- woodwork/tests/accessor/test_indexers.py | 55 +- woodwork/tests/accessor/test_serialization.py | 467 ++++---------- woodwork/tests/accessor/test_statistics.py | 194 ++---- .../tests/accessor/test_table_accessor.py | 436 ++++--------- woodwork/tests/conftest.py | 600 +----------------- .../tests/logical_types/test_logical_types.py | 204 +----- .../latest_dask_dependencies.txt | 7 - .../latest_spark_dependencies.txt | 5 - .../minimum_dask_requirements.txt | 7 - .../minimum_spark_requirements.txt | 8 - woodwork/tests/testing_utils/__init__.py | 14 - woodwork/tests/testing_utils/table_utils.py | 67 -- woodwork/tests/type_system/conftest.py | 321 +--------- .../tests/type_system/test_ltype_inference.py | 79 +-- woodwork/tests/utils/test_accessor_utils.py | 88 +-- woodwork/tests/utils/test_concat.py | 55 +- woodwork/tests/utils/test_read_file.py | 22 +- woodwork/tests/utils/test_utils.py | 92 +-- woodwork/type_sys/type_system.py | 41 +- woodwork/type_sys/utils.py | 12 - woodwork/typing.py | 11 +- woodwork/utils.py | 53 +- 62 files changed, 620 insertions(+), 3569 deletions(-) delete mode 100644 docs/source/guides/using_woodwork_with_dask_and_spark.ipynb delete mode 100644 woodwork/tests/requirement_files/latest_dask_dependencies.txt delete mode 100644 woodwork/tests/requirement_files/latest_spark_dependencies.txt delete mode 100644 
woodwork/tests/requirement_files/minimum_dask_requirements.txt delete mode 100644 woodwork/tests/requirement_files/minimum_spark_requirements.txt diff --git a/.github/workflows/build_docs.yaml b/.github/workflows/build_docs.yaml index 2c71edc37..c0b536459 100644 --- a/.github/workflows/build_docs.yaml +++ b/.github/workflows/build_docs.yaml @@ -7,15 +7,14 @@ on: - main env: PYARROW_IGNORE_TIMEZONE: 1 - JAVA_HOME: "/usr/lib/jvm/java-11-openjdk-amd64" ALTERYX_OPEN_SRC_UPDATE_CHECKER: False jobs: build_docs: - name: 3.9 build docs + name: ${{ matrix.python_version }} Build Docs runs-on: ubuntu-latest strategy: matrix: - python_version: ["3.9"] + python_version: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Checkout repository uses: actions/checkout@v3 @@ -37,7 +36,6 @@ jobs: run: | sudo apt update sudo apt install -y pandoc - sudo apt install -y openjdk-11-jre-headless python -m pip install --upgrade pip - name: Install woodwork with doc dependencies (not using cache) if: steps.cache.outputs.cache-hit != 'true' diff --git a/.github/workflows/install_test.yaml b/.github/workflows/install_test.yaml index e99aaaf76..e8108de46 100644 --- a/.github/workflows/install_test.yaml +++ b/.github/workflows/install_test.yaml @@ -51,6 +51,3 @@ jobs: - name: Check package conflicts run: | python -m pip check - - name: Verify extra_requires commands - run: | - python -m pip install "unpacked_sdist/[dask,spark]" diff --git a/.github/workflows/latest_dependency_checker.yaml b/.github/workflows/latest_dependency_checker.yaml index c63accc32..702b59631 100644 --- a/.github/workflows/latest_dependency_checker.yaml +++ b/.github/workflows/latest_dependency_checker.yaml @@ -27,25 +27,6 @@ jobs: python -m pip install .[test] make checkdeps OUTPUT_FILEPATH=woodwork/tests/requirement_files/latest_core_dependencies.txt cat woodwork/tests/requirement_files/latest_core_dependencies.txt - - name: Update latest spark dependencies - run: | - python -m virtualenv venv_spark - source venv_spark/bin/activate - python -m pip install --upgrade pip - python -m pip install .[spark,test] - make checkdeps OUTPUT_FILEPATH=woodwork/tests/requirement_files/latest_spark_dependencies.txt - cat woodwork/tests/requirement_files/latest_spark_dependencies.txt - - name: Update latest dask dependencies - run: | - python -m virtualenv venv_dask - source venv_dask/bin/activate - python -m pip install --upgrade pip - python -m pip install .[test,dask] - make checkdeps OUTPUT_FILEPATH=woodwork/tests/requirement_files/latest_dask_dependencies.txt - cat woodwork/tests/requirement_files/latest_dask_dependencies.txt - python -m pip install .[dev] - make lint-fix - pre-commit autoupdate - name: Create Pull Request uses: peter-evans/create-pull-request@v3 with: diff --git a/.github/workflows/minimum_dependency_checker.yaml b/.github/workflows/minimum_dependency_checker.yaml index cda63b2c1..a489c92ca 100644 --- a/.github/workflows/minimum_dependency_checker.yaml +++ b/.github/workflows/minimum_dependency_checker.yaml @@ -29,22 +29,6 @@ jobs: paths: 'pyproject.toml' options: 'dependencies' output_filepath: 'woodwork/tests/requirement_files/minimum_core_requirements.txt' - - name: Run min dep generator - core + spark reqs - id: min_dep_gen_spark - uses: alteryx/minimum-dependency-generator@v3 - with: - paths: 'pyproject.toml' - options: 'dependencies' - extras_require: 'spark' - output_filepath: 'woodwork/tests/requirement_files/minimum_spark_requirements.txt' - - name: Run min dep generator - core + dask - id: min_dep_gen_dask - uses: 
alteryx/minimum-dependency-generator@v3 - with: - paths: 'pyproject.toml' - options: 'dependencies' - extras_require: 'dask' - output_filepath: 'woodwork/tests/requirement_files/minimum_dask_requirements.txt' - name: Create Pull Request uses: peter-evans/create-pull-request@v3 with: diff --git a/.github/workflows/tests_with_latest_deps.yaml b/.github/workflows/tests_with_latest_deps.yaml index f77e28c58..39ba58fe7 100644 --- a/.github/workflows/tests_with_latest_deps.yaml +++ b/.github/workflows/tests_with_latest_deps.yaml @@ -8,17 +8,16 @@ on: workflow_dispatch: env: PYARROW_IGNORE_TIMEZONE: 1 - JAVA_HOME: "/usr/lib/jvm/java-11-openjdk-amd64" ALTERYX_OPEN_SRC_UPDATE_CHECKER: False jobs: unit_latest_tests: - name: ${{ matrix.python_version }} ${{ matrix.directories }} unit tests + name: ${{ matrix.python_version }} ${{ matrix.directories }} Unit Tests runs-on: ubuntu-latest strategy: fail-fast: true matrix: python_version: ["3.9", "3.10", "3.11", "3.12"] - directories: ["Core", "Dask/Spark - All Other Tests", "Dask/Spark - Testing Table Accessor", "Dask/Spark - Testing to Disk with LatLong", "Dask/Spark - All other Serialization"] + directories: ["Core"] steps: - name: Set up python ${{ matrix.python_version }} uses: actions/setup-python@v4 @@ -38,61 +37,17 @@ jobs: - name: Install woodwork with test requirements run: | python -m pip install -e unpacked_sdist/[test] - - if: ${{ startsWith(matrix.directories, 'Dask/Spark') }} - name: Install Dask and Spark Requirements + - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'Core' }} + name: Run Unit Tests with core requirements with code coverage run: | - sudo apt update - sudo apt install -y openjdk-11-jre-headless - python -m pip install unpacked_sdist/[spark] - python -m pip install unpacked_sdist/[dask] cd unpacked_sdist - coverage erase - - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'Dask/Spark - Testing to Disk with LatLong' }} - name: Run testing to Disk with LatLong Unit Tests (no code coverage) - run: | - cd unpacked_sdist - pytest woodwork/tests/accessor/test_serialization.py::test_to_disk_with_latlong -n 2 --durations 0 - - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'Dask/Spark - All other Serialization' }} - name: Run all other Serialization Unit Tests (no code coverage) - run: | - cd unpacked_sdist - pytest woodwork/tests/accessor/test_serialization.py --ignore=woodwork/tests/accessor/test_serialization.py::test_to_disk_with_latlong -n 2 --durations 0 - - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'Dask/Spark - Testing Table Accessor' }} - name: Run Table Accessor Unit Tests (no code coverage) - run: | - cd unpacked_sdist - pytest woodwork/tests/accessor/test_table_accessor.py -n 2 --durations 0 - - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'Dask/Spark - All Other Tests' }} - name: Run all other Unit Tests (no code coverage) - run: | - cd unpacked_sdist - pytest woodwork/ -n 2 --ignore=woodwork/tests/accessor/test_serialization.py --ignore=woodwork/tests/accessor/test_table_accessor.py --durations 0 - - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'Dask/Spark - Testing to Disk with LatLong' }} - name: Run Testing to Disk with LatLong Unit Tests with code coverage - run: | - cd unpacked_sdist - pytest woodwork/tests/accessor/test_serialization.py::test_to_disk_with_latlong -n 2 --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml - - if: ${{ matrix.python_version == 3.9 && 
matrix.directories == 'Dask/Spark - All other Serialization' }} - name: Run all other Serialization Unit Tests with code coverage - run: | - cd unpacked_sdist - pytest woodwork/tests/accessor/test_serialization.py --ignore=woodwork/tests/accessor/test_serialization.py::test_to_disk_with_latlong -n 2 --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml - - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'Dask/Spark - Testing Table Accessor' }} - name: Run Table Accessor Unit Tests with code coverage - run: | - cd unpacked_sdist - pytest woodwork/tests/accessor/test_table_accessor.py -n 2 --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml - - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'Dask/Spark - All Other Tests' }} - name: Run all other Unit Tests with code coverage - run: | - cd unpacked_sdist - pytest woodwork/ -n 2 --ignore=woodwork/tests/accessor/test_serialization.py --ignore=woodwork/tests/accessor/test_table_accessor.py --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml - - if: ${{ matrix.directories == 'Core' }} - name: Run Unit Tests with core requirements only (no code coverage) + pytest woodwork/ -n 2 --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml + - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'Core' }} + name: Run Unit Tests with core requirements without code coverage run: | cd unpacked_sdist pytest woodwork/ -n 2 - - if: ${{ matrix.python_version == 3.9 && matrix.directories != 'Core' }} + - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'Core' }} name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: diff --git a/.github/workflows/tests_with_minimum_deps.yaml b/.github/workflows/tests_with_minimum_deps.yaml index 36fb228c4..d32f112ab 100644 --- a/.github/workflows/tests_with_minimum_deps.yaml +++ b/.github/workflows/tests_with_minimum_deps.yaml @@ -6,12 +6,12 @@ on: branches: - main jobs: - py38_unit_tests_minimum_dependencies: + py39_unit_tests_minimum_dependencies: name: Tests - 3.9 Minimum Dependencies runs-on: ubuntu-latest strategy: matrix: - libraries: ["core", "dask", "spark", "min_min"] + libraries: ["core"] steps: - name: Checkout repository uses: actions/checkout@v3 @@ -26,14 +26,6 @@ jobs: run: | python -m pip install -e . 
--no-dependencies python -m pip install -r woodwork/tests/requirement_files/minimum_test_requirements.txt - - if: ${{ matrix.libraries == 'spark' }} - name: Install woodwork - minimum spark, core requirements - run: | - python -m pip install -r woodwork/tests/requirement_files/minimum_spark_requirements.txt - - if: ${{ matrix.libraries == 'dask' }} - name: Install woodwork - minimum dask, core requirements - run: | - python -m pip install -r woodwork/tests/requirement_files/minimum_dask_requirements.txt - if: ${{ matrix.libraries == 'core' }} name: Install woodwork - minimum core requirements run: | diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 90dbda93f..f59b78877 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -17,11 +17,6 @@ build: os: "ubuntu-22.04" tools: python: "3.9" - apt_packages: - - openjdk-11-jre-headless - jobs: - post_build: - - export JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" python: install: diff --git a/Makefile b/Makefile index b70ef1f6d..2c3701968 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ installdeps-test: upgradepip .PHONY: checkdeps checkdeps: - $(eval allow_list='numpy|pandas|scikit|click|pyarrow|distributed|dask|pyspark') + $(eval allow_list='numpy|pandas|scikit|click|pyarrow') pip freeze | grep -v "woodwork.git" | grep -E $(allow_list) > $(OUTPUT_FILEPATH) .PHONY: upgradepip diff --git a/contributing.md b/contributing.md index e90c21308..50762d799 100644 --- a/contributing.md +++ b/contributing.md @@ -24,40 +24,18 @@ Whether you are a novice or experienced software developer, all contributions an make installdeps-dev git checkout -b issue####-branch_name ``` -* You will need to install Spark, Scala, and Pandoc to run all unit tests & build docs: - - > If you do not install Spark/Scala, you can still run the unit tests (the Spark tests will be skipped). +* You will need to install Pandoc to run all unit tests & build docs: > Pandoc is only needed to build the documentation locally. - **macOS (Intel)** (use [Homebrew](https://brew.sh/)): - ```console - brew tap AdoptOpenJDK/openjdk - brew install --cask adoptopenjdk11 - brew install scala apache-spark pandoc - echo 'export JAVA_HOME=$(/usr/libexec/java_home)' >> ~/.zshrc - echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> ~/.zshrc - ``` - **macOS (M1)** (use [Homebrew](https://brew.sh/)): + **macOS** (use [Homebrew](https://brew.sh/)): ```console - brew install openjdk@11 scala apache-spark pandoc - echo 'export PATH="/opt/homebrew/opt/openjdk@11/bin:$PATH"' >> ~/.zshrc - echo 'export CPPFLAGS="-I/opt/homebrew/opt/openjdk@11/include:$CPPFLAGS"' >> ~/.zprofile - sudo ln -sfn /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk /Library/Java/JavaVirtualMachines/openjdk-11.jdk + brew install pandoc ``` **Ubuntu**: ```console - sudo apt install openjdk-11-jre openjdk-11-jdk scala pandoc -y - echo "export SPARK_HOME=/opt/spark" >> ~/.profile - echo "export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin" >> ~/.profile - echo "export PYSPARK_PYTHON=/usr/bin/python3" >> ~/.profile - ``` - - **Amazon Linux**: - ```console - sudo amazon-linux-extras install java-openjdk11 scala -y - amazon-linux-extras enable java-openjdk11 + sudo apt install pandoc -y ``` #### 2. 
Implement your Pull Request diff --git a/docs/source/guides/custom_types_and_type_inference.ipynb b/docs/source/guides/custom_types_and_type_inference.ipynb index 8e518f870..a14013803 100644 --- a/docs/source/guides/custom_types_and_type_inference.ipynb +++ b/docs/source/guides/custom_types_and_type_inference.ipynb @@ -53,7 +53,6 @@ " \"\"\"Represents Logical Types that contain 12-digit UPC Codes.\"\"\"\n", "\n", " primary_dtype = \"category\"\n", - " pyspark_dtype = \"string\"\n", " standard_tags = {\"category\", \"upc_code\"}" ] }, @@ -64,7 +63,6 @@ "When defining the `UPCCode` LogicalType class, three class attributes were set. All three of these attributes are optional, and will default to the values defined on the `LogicalType` class if they are not set when defining the new type.\n", "\n", "- `primary_dtype`: This value specifies how the data will be stored. If the column of the dataframe is not already of this type, Woodwork will convert the data to this dtype. This should be specified as a string that represents a valid pandas dtype. If not specified, this will default to `'string'`.\n", - "- `pyspark_dtype`: This value specifies the dtype to use if pyspark does not support the dtype specified by `primary_dtype`. In our example, we set this to `'string'` since Spark does not currently support the `'category'` dtype.\n", "- `standard_tags`: This is a set of semantic tags to apply to any column that is set with the specified LogicalType. If not specified, `standard_tags` will default to an empty set.\n", "- docstring: Adding a docstring for the class is optional, but if specified, this text will be used for adding a description of the type in the list of available types returned by `ww.list_logical_types()`." ] @@ -214,7 +212,7 @@ " try:\n", " series.astype(\"int\")\n", " return True\n", - " except:\n", + " except Exception:\n", " return False\n", " return False" ] diff --git a/docs/source/guides/guides_index.rst b/docs/source/guides/guides_index.rst index 8461bed9e..19e2d2bc3 100644 --- a/docs/source/guides/guides_index.rst +++ b/docs/source/guides/guides_index.rst @@ -10,6 +10,5 @@ The guides below provide more detail on the functionality of Woodwork. 
working_with_types_and_tags setting_config_options statistical_insights - using_woodwork_with_dask_and_spark custom_types_and_type_inference saving_and_loading_dataframes diff --git a/docs/source/guides/logical_types_and_semantic_tags.ipynb b/docs/source/guides/logical_types_and_semantic_tags.ipynb index bf6147616..e3cb8caff 100644 --- a/docs/source/guides/logical_types_and_semantic_tags.ipynb +++ b/docs/source/guides/logical_types_and_semantic_tags.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "later-pharmaceutical", + "id": "0", "metadata": {}, "source": [ "# Understanding Logical Types and Semantic Tags\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "czech-strip", + "id": "1", "metadata": {}, "source": [ "\n", @@ -40,7 +40,7 @@ { "cell_type": "code", "execution_count": null, - "id": "supposed-cookie", + "id": "2", "metadata": {}, "outputs": [], "source": [ @@ -51,7 +51,7 @@ }, { "cell_type": "markdown", - "id": "extra-husband", + "id": "3", "metadata": {}, "source": [ "#### Standard Tags\n", @@ -92,7 +92,7 @@ { "cell_type": "code", "execution_count": null, - "id": "sunset-chassis", + "id": "4", "metadata": {}, "outputs": [], "source": [ @@ -103,7 +103,7 @@ }, { "cell_type": "markdown", - "id": "professional-greek", + "id": "5", "metadata": {}, "source": [ "In the DataFrame above, we can see a `parent_type` column. The `parent_type` of a `LogicalType` refers to a logical type that is a more general version of the current `LogicalType`. See the [Custom Types and Type Inference](custom_types_and_type_inference.ipynb#Logical-Type-Relationships) guide for more details on how parent-child relationships between logical types impacts Woodwork's type inference.\n", @@ -120,7 +120,6 @@ " \"\"\"Base class for all other Logical Types\"\"\"\n", " type_string = ClassNameDescriptor()\n", " primary_dtype = 'string'\n", - " pyspark_dtype = None\n", " standard_tags = set()\n", " ```\n", "\n", @@ -137,7 +136,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bottom-alliance", + "id": "6", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +150,7 @@ { "cell_type": "code", "execution_count": null, - "id": "gothic-hello", + "id": "7", "metadata": {}, "outputs": [], "source": [ @@ -161,7 +160,7 @@ }, { "cell_type": "markdown", - "id": "capable-marketing", + "id": "8", "metadata": {}, "source": [ "#### Numeric Logical Types\n", @@ -216,7 +215,7 @@ { "cell_type": "code", "execution_count": null, - "id": "handled-russia", + "id": "9", "metadata": {}, "outputs": [], "source": [ @@ -236,7 +235,7 @@ }, { "cell_type": "markdown", - "id": "forty-banks", + "id": "10", "metadata": {}, "source": [ "#### Categorical Logical Types\n", @@ -247,7 +246,6 @@ "\n", "- **physical type**: `category`\n", "- **inference**: Woodwork defines a threshold for percentage unique values relative to the size of the series below which a series will be considered categorical. 
See [setting config options guide](setting_config_options.ipynb#Categorical-Threshold) for more information on how to control this threshold.\n", - "- **spark note**: Spark does not support the `category` dtype, so for Spark DataFrames and Series, the `string` dtype will be used.\n", "\n", "\n", "Some examples of data for which the Categorical logical type would apply:\n", @@ -264,7 +262,6 @@ "\n", "- **physical type**: `category`\n", "- **standard tags**: `{'category'}`\n", - "- **spark note**: Spark does not support the `category` dtype, so for Spark DataFrames and Series, the `string` dtype will be used.\n", "\n", "For example: `'AU'` for Australia, `'CN'` for China, and `'CA'` for Canada.\n", "\n", @@ -277,7 +274,6 @@ "- **parameters**:\n", " - `order` - the order of the ordinal values in the column from low to high\n", "- **validation** - an order must be defined for an Ordinal column on a DataFrame or Series, and all elements of the order must be present.\n", - "- **spark note**: Spark does not support the `category` dtype, so for Spark DataFrames and Series, the `string` dtype will be used.\n", "\n", "Some examples of data for which the Ordinal logical type would apply:\n", "\n", @@ -293,7 +289,6 @@ "\n", "- **physical type**: `category`\n", "- **standard tags**: `{'category'}`\n", - "- **spark note**: Spark does not support the `category` dtype, so for Spark DataFrames and Series, the `string` dtype will be used.\n", "\n", "##### SubRegionCode\n", "\n", @@ -301,7 +296,6 @@ "\n", "- **physical type**: `category`\n", "- **standard tags**: `{'category'}`\n", - "- **spark note**: Spark does not support the `category` dtype, so for Spark DataFrames and Series, the `string` dtype will be used.\n", "\n", "For example: `'US-IL'` to represent Illinois in the United States or `'AU-TAS'` to represent Tasmania in Australia." ] @@ -309,7 +303,7 @@ { "cell_type": "code", "execution_count": null, - "id": "obvious-mayor", + "id": "11", "metadata": {}, "outputs": [], "source": [ @@ -336,7 +330,7 @@ }, { "cell_type": "markdown", - "id": "elegant-saying", + "id": "12", "metadata": {}, "source": [ "#### Miscellaneous Logical Types with Specific Formats\n", @@ -378,7 +372,6 @@ "\n", "- **physical type**: `object`\n", "- **transformation**: Will convert inputs into a tuple of floats. 
Any null values will be stored as `np.nan`\n", - "- **spark note**: Spark does not support tuples, so latlongs will be stored as a list of floats\n", "\n", "##### Timedelta\n", "\n", @@ -399,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "separated-brick", + "id": "13", "metadata": {}, "outputs": [], "source": [ @@ -434,7 +427,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acquired-daughter", + "id": "14", "metadata": {}, "outputs": [], "source": [ @@ -450,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "opposite-retrieval", + "id": "15", "metadata": {}, "outputs": [], "source": [ @@ -459,7 +452,7 @@ }, { "cell_type": "markdown", - "id": "imported-singapore", + "id": "16", "metadata": {}, "source": [ "#### String Logical Types\n", @@ -519,7 +512,7 @@ { "cell_type": "code", "execution_count": null, - "id": "broke-marriage", + "id": "17", "metadata": {}, "outputs": [], "source": [ @@ -585,7 +578,7 @@ }, { "cell_type": "markdown", - "id": "secure-delta", + "id": "18", "metadata": {}, "source": [ "## ColumnSchema objects\n", @@ -598,7 +591,7 @@ { "cell_type": "code", "execution_count": null, - "id": "adult-portfolio", + "id": "19", "metadata": {}, "outputs": [], "source": [ @@ -609,7 +602,7 @@ }, { "cell_type": "markdown", - "id": "separate-slovenia", + "id": "20", "metadata": {}, "source": [ "Above is the typing information for a Woodwork DataFrame. If we want, we can access just the schema of typing information outside of the context of the actual data in the DataFrame." @@ -618,7 +611,7 @@ { "cell_type": "code", "execution_count": null, - "id": "quiet-virus", + "id": "21", "metadata": {}, "outputs": [], "source": [ @@ -628,12 +621,12 @@ }, { "cell_type": "markdown", - "id": "opened-payment", + "id": "22", "metadata": {}, "source": [ "The representation of the `woodwork.table_schema.TableSchema` is only different in that it does not have a column for the physical types.\n", "\n", - "This lack of a physical type is due to the fact that a `TableSchema` has no data, and therefore no physical representation of the data. We often rely on physical typing information to know the exact pandas or Dask or Spark operations that are valid for a DataFrame, but for a schema of typing information that is not tied to data, those operations are not relevant.\n", + "This lack of a physical type is due to the fact that a `TableSchema` has no data, and therefore no physical representation of the data. We often rely on physical typing information to know the exact pandas operations that are valid for a DataFrame, but for a schema of typing information that is not tied to data, those operations are not relevant.\n", "\n", "Now, let's look at a single column of typing information, or a `woodwork.column_schema.ColumnSchema` that we can aquire in much the same way as we can select a Series from the DataFrame: " ] @@ -641,7 +634,7 @@ { "cell_type": "code", "execution_count": null, - "id": "southern-comedy", + "id": "23", "metadata": {}, "outputs": [], "source": [ @@ -653,7 +646,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bottom-darwin", + "id": "24", "metadata": {}, "outputs": [], "source": [ @@ -664,7 +657,7 @@ }, { "cell_type": "markdown", - "id": "running-slovenia", + "id": "25", "metadata": {}, "source": [ "The `column_schema` object above can be understood as typing information for a single column that is not tied to any data. 
In this case, we happen to know where the column schema came from - it was the `quantity` column from the `retail_df` DataFrame. But we can also create a `ColumnSchema` that exists without being associated with any individual column of data.\n", @@ -675,7 +668,7 @@ { "cell_type": "code", "execution_count": null, - "id": "artificial-conversation", + "id": "26", "metadata": {}, "outputs": [], "source": [ @@ -684,7 +677,7 @@ }, { "cell_type": "markdown", - "id": "polar-wireless", + "id": "27", "metadata": {}, "source": [ "Below are several `ColumnSchema`s that all would include our `quantity` column, but each of them describe a different type space. These `ColumnSchema`s get more restrictive as we go down:\n", @@ -699,12 +692,12 @@ }, { "cell_type": "markdown", - "id": "sexual-adoption", + "id": "28", "metadata": {}, "source": [ "## Checking for nullable logical types\n", "\n", - "Some logical types support having null values in the underlying data while others do not. This is entirely based on whether a logical type's underlying `primary_dtype` or `pyspark_dtype` supports null values. For example, the `EmailAddress` logical type has an underlying primary dtype of `string`. Pandas allows series with the dtype `string` to contain null values marked by the `pandas.NA` sentinel. Therefore, `EmailAddress` supports null values. On the other hand, the `Integer` logical type does not support null values since its underlying primary pandas dtype is `int64`. Pandas does not allow null values in series with the dtype `int64`. However, pandas does allow null values in series with the dtype `Int64`. Therefore, the `IntegerNullable` logical type supports null values since its primary dtype is `Int64`.\n", + "Some logical types support having null values in the underlying data while others do not. This is entirely based on whether a logical type's underlying `primary_dtype` supports null values. For example, the `EmailAddress` logical type has an underlying primary dtype of `string`. Pandas allows series with the dtype `string` to contain null values marked by the `pandas.NA` sentinel. Therefore, `EmailAddress` supports null values. On the other hand, the `Integer` logical type does not support null values since its underlying primary pandas dtype is `int64`. Pandas does not allow null values in series with the dtype `int64`. However, pandas does allow null values in series with the dtype `Int64`. Therefore, the `IntegerNullable` logical type supports null values since its primary dtype is `Int64`.\n", "\n", "You can check if a column contains a nullable logical type by using `nullable` on the column accessor. The sections above that describe each type's characteristics include information about whether or not a logical type is nullable." ] @@ -712,7 +705,7 @@ { "cell_type": "code", "execution_count": null, - "id": "surprised-today", + "id": "29", "metadata": {}, "outputs": [], "source": [ diff --git a/docs/source/guides/using_woodwork_with_dask_and_spark.ipynb b/docs/source/guides/using_woodwork_with_dask_and_spark.ipynb deleted file mode 100644 index 9e5f9ee85..000000000 --- a/docs/source/guides/using_woodwork_with_dask_and_spark.ipynb +++ /dev/null @@ -1,354 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using Woodwork with Dask and Spark DataFrames\n", - "\n", - "Woodwork allows you to add custom typing information to Dask DataFrames or Spark DataFrames when working with datasets that are too large to easily fit in memory. 
Although initializing Woodwork on a Dask or Spark DataFrame follows the same process as you follow when initializing on a pandas DataFrame, there are a few limitations to be aware of. This guide provides a brief overview of using Woodwork with a Dask or Spark DataFrame. Along the way, the guide highlights several key items to keep in mind when using a Dask or Spark DataFrame as input.\n", - "\n", - "Using Woodwork with either Dask or Spark requires the installation of the Dask or Spark libraries respectively. These libraries can be installed directly with these commands:\n", - "\n", - "```python\n", - "python -m pip install \"woodwork[dask]\"\n", - "```\n", - "\n", - "```python\n", - "python -m pip install \"woodwork[spark]\"\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dask DataFrame Example\n", - "Create a Dask DataFrame to use in our example. Normally you create the DataFrame directly by reading in the data from saved files, but you will create it from a demo pandas DataFrame." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import dask.dataframe as dd\n", - "\n", - "import woodwork as ww\n", - "\n", - "df_pandas = ww.demo.load_retail(nrows=1000, init_woodwork=False)\n", - "df_dask = dd.from_pandas(df_pandas, npartitions=10)\n", - "df_dask" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that you have a Dask DataFrame, you can use it to create a Woodwork DataFrame, just as you would with a pandas DataFrame:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_dask.ww.init(index=\"order_product_id\")\n", - "df_dask.ww" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see from the output above, Woodwork was initialized successfully, and logical type inference was performed for all of the columns.\n", - "\n", - "However, that illustrates one of the key issues in working with Dask DataFrames. In order to perform logical type inference, Woodwork needs to bring the data into memory so it can be analyzed. Currently, Woodwork reads data from the first partition of data only, and then uses this data for type inference. Depending on the complexity of the data, this could be a time consuming operation. Additionally, if the first partition is not representative of the entire dataset, the logical types for some columns may be inferred incorrectly.\n", - "\n", - "### Skipping or Overriding Type Inference\n", - "If this process takes too much time, or if the logical types are not inferred correctly, you can manually specify the logical types for each column. If the logical type for a column is specified, type inference for that column will be skipped. If logical types are specified for all columns, logical type inference will be skipped completely and Woodwork will not need to bring any of the data into memory during initialization.\n", - "\n", - "To skip logical type inference completely or to correct type inference issues, define a logical types dictionary with the correct logical type defined for each column in the DataFrame, then pass that dictionary to the initialization call." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "logical_types = {\n", - " \"order_product_id\": \"Integer\",\n", - " \"order_id\": \"Categorical\",\n", - " \"product_id\": \"Categorical\",\n", - " \"description\": \"NaturalLanguage\",\n", - " \"quantity\": \"Integer\",\n", - " \"order_date\": \"Datetime\",\n", - " \"unit_price\": \"Double\",\n", - " \"customer_name\": \"PersonFullName\",\n", - " \"country\": \"Categorical\",\n", - " \"total\": \"Double\",\n", - " \"cancelled\": \"Boolean\",\n", - "}\n", - "\n", - "df_dask.ww.init(index=\"order_product_id\", logical_types=logical_types)\n", - "df_dask.ww" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DataFrame Statistics\n", - "There are some Woodwork methods that require bringing the underlying Dask DataFrame into memory: `describe`, `value_counts` and `mutual_information`. When called, these methods will call a `compute` operation on the DataFrame to calculate the desired information. This might be problematic for datasets that cannot fit in memory, so exercise caution when using these methods." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_dask.ww.describe(include=[\"numeric\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_dask.ww.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_dask.ww.mutual_information().head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Spark DataFrame Example\n", - "As above, first create a Spark DataFrame to use in our example. Normally you create the DataFrame directly by reading in the data from saved files, but here you create it from a demo pandas DataFrame." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "# The two lines below only need to be executed if you do not have Spark properly configured.\n", - "# However if you are running into config errors, this resource may be useful:\n", - "# https://stackoverflow.com/questions/52133731/how-to-solve-cant-assign-requested-address-service-sparkdriver-failed-after\n", - "import pyspark.sql as sql\n", - "\n", - "spark = (\n", - " sql.SparkSession.builder.master(\"local[2]\")\n", - " .config(\n", - " \"spark.driver.extraJavaOptions\", \"-Dio.netty.tryReflectionSetAccessible=True\"\n", - " )\n", - " .config(\"spark.sql.shuffle.partitions\", \"2\")\n", - " .config(\"spark.driver.bindAddress\", \"127.0.0.1\")\n", - " .getOrCreate()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pyspark.pandas as ps\n", - "\n", - "df_spark = ps.from_pandas(df_pandas)\n", - "df_spark.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that you have a Spark DataFrame, you can initialize Woodwork, just as you would with a pandas DataFrame:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_spark.ww.init(index=\"order_product_id\")\n", - "df_spark.ww" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see from the output above, Woodwork has been initialized successfully, and logical type inference was performed for all of the columns.\n", - "\n", - "### Notes on Spark Dtype Conversions\n", - "In the types table above, one important thing to notice is that the physical types for the Spark DataFrame are different than the physical types for the Dask DataFrame. The reason for this is that Spark does not support the `category` dtype that is available with pandas and Dask.\n", - "\n", - "When Woodwork is initialized, the dtype of the DataFrame columns are converted to a set of standard dtypes, defined by the LogicalType `primary_dtype` property. By default, Woodwork uses the `category` dtype for any categorical logical types, but this is not available with Spark.\n", - "\n", - "For LogicalTypes that have `primary_dtype` properties that are not compatible with Spark, Woodwork will try to convert the column dtype, but will be unsuccessful. At that point, Woodwork will use a backup dtype that is compatible with Spark. The implication of this is that using Woodwork with a Spark DataFrame may result in dtype values that are different than the values you would get when working with an otherwise identical pandas DataFrame.\n", - "\n", - "Since Spark does not support the `category` dtype, any column that is inferred or specified with a logical type of `Categorical` will have its values converted to strings and stored with a dtype of `string`. This means that a categorical column containing numeric values, will be converted into the equivalent string values.\n", - "\n", - "Finally, Spark does not support the `timedelta64[ns]` dtype. For this, there is not a clean backup dtype, so the use of `Timedelta` LogicalType is not supported with Spark DataFrames." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Skipping or Overriding Type Inference\n", - "As with Dask, Woodwork must bring the data into memory so it can be analyzed for type inference. 
Currently, Woodwork reads the first 100,000 rows of data to use for type inference when using a Spark DataFrame as input. If the first 100,000 rows are not representative of the entire dataset, the logical types for some columns might be inferred incorrectly.\n", - "\n", - "To skip logical type inference completely or to correct type inference issues, define a logical types dictionary with the correct logical type defined for each column in the dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "logical_types = {\n", - " \"order_product_id\": \"Integer\",\n", - " \"order_id\": \"Categorical\",\n", - " \"product_id\": \"Categorical\",\n", - " \"description\": \"NaturalLanguage\",\n", - " \"quantity\": \"Integer\",\n", - " \"order_date\": \"Datetime\",\n", - " \"unit_price\": \"Double\",\n", - " \"customer_name\": \"PersonFullName\",\n", - " \"country\": \"Categorical\",\n", - " \"total\": \"Double\",\n", - " \"cancelled\": \"Boolean\",\n", - "}\n", - "\n", - "df_spark.ww.init(index=\"order_product_id\", logical_types=logical_types)\n", - "df_spark.ww" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DataFrame Statistics\n", - "As with Dask, running `describe`, `value_counts` or `mutual_information` requires bringing the data into memory to perform the analysis. When called, these methods will call a `to_pandas` operation on the DataFrame to calculate the desired information. This may be problematic for very large datasets, so exercise caution when using these methods." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_spark.ww.describe(include=[\"numeric\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_spark.ww.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_spark.ww.mutual_information().head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Validation Limitations\n", - "\n", - "Woodwork performs several validation checks to confirm that the data in the DataFrame is appropriate for the specified parameters. Because some of these validation steps would require pulling the data into memory, they are skipped when using Woodwork with a Dask or Spark DataFrame. This section provides an overview of the validation checks that are performed with pandas input but skipped with Dask or Spark input.\n", - "\n", - "### Index Uniqueness\n", - "Normally a check is performed to verify that any column specified as the index contains no duplicate values. With Dask or Spark input, this check is skipped and you must manually verify that any column specified as an index column contains unique values.\n", - "\n", - "### Data Consistency with LogicalType (Dask Only)\n", - "If you manually define the LogicalType for a column when initializing Woodwork, a check is performed to verify that the data in that column is appropriate for the specified LogicalType. For example, with pandas input if you specify a LogicalType of `Double` for a column that contains letters such as `['a', 'b', 'c']`, an error is raised because it is not possible to convert the letters into numeric values with the `float` dtype associated with the `Double` LogicalType.\n", - "\n", - "With Dask input, no such error appears at the time initialization. 
However, behind the scenes, Woodwork attempts to convert the column physical type to `float`, and this conversion is added to the Dask task graph, without raising an error. However, an error is raised if a `compute` operation is called on the DataFrame as Dask attempts to execute the conversion step. Extra care should be taken when using Dask input to make sure any specified logical types are consistent with the data in the columns to avoid this type of error.\n", - "\n", - "### Ordinal Order Values Check\n", - "For the `Ordinal` LogicalType, a check is typically performed to make sure that the data column does not contain any values that are not present in the defined order values. This check will not be performed with Dask or Spark input. Users should manually verify that the defined order values are complete to avoid unexpected results." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Other Limitations\n", - "\n", - "### Reading from CSV Files\n", - "Woodwork provides the ability to read data directly from a CSV file into a Woodwork DataFrame. The helper function used for this, `woodwork.read_file`, currently only reads the data into a pandas DataFrame. At some point, this limitation may be removed, allowing data to be read into a Dask or Spark DataFrame. For now, only pandas DataFrames can be created with this function.\n", - "\n", - "### Sorting DataFrame on Time Index\n", - "When initializing with a time index, Woodwork, by default, will sort the input DataFrame first on the time index and then on the index, if specified. Because sorting a distributed DataFrame is a computationally expensive operation, this sorting is performed only when using a pandas DataFrame. If a sorted DataFrame is needed when using a Dask or Spark, the user should manually sort the DataFrame as needed.\n", - "\n", - "### Equality of Woodwork DataFrames\n", - "In order to avoid bringing a Dask DataFrame into memory, Woodwoork does not consider the equality of the data when checking whether Woodwork Dataframe initialized from a Dask or Spark DataFrame is equal to another Woodwork DataFrame. This means that two DataFrames with identical names, columns, indices, semantic tags, and LogicalTypes but different underlying data will be treated as equal if at least one of them uses Dask or Spark.\n", - "\n", - "### LatLong Columns\n", - "When working with the LatLong logical type, Woodwork converts all LatLong columns to a standard format of a tuple of floats for Dask DataFrames and a list of floats for Spark DataFrames. In order to do this, the data is read into memory, which may be problematic for large datatsets.\n", - "\n", - "### Integer Column Names\n", - "Woodwork allows column names of any format that is supported by the DataFrame. However, Dask DataFrames do not currently support integer column names.\n", - "\n", - "### Setting DataFrame Index\n", - "When specifying a Woodwork index with a pandas DataFrame, the underlying index of the DataFrame will be updated to match the column specified as the Woodwork index. When specifying a Woodwork index on a Dask or Spark DataFrame, however, the underlying index will remain unchanged.\n", - "\n", - "### Dask `string[pyarrow]`\n", - "Woodwork may have issues with the new string storage model used by Dask. 
To workaround this, add `dask.config.set({'dataframe.convert-string': False})`, prior to running dask operations.\n" - ] - } - ], - "metadata": { - "celltoolbar": "Raw Cell Format", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.2" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/source/guides/working_with_types_and_tags.ipynb b/docs/source/guides/working_with_types_and_tags.ipynb index 9fee0d6b5..72a9bcdd7 100644 --- a/docs/source/guides/working_with_types_and_tags.ipynb +++ b/docs/source/guides/working_with_types_and_tags.ipynb @@ -49,9 +49,7 @@ "* `string`\n", "* `timedelta64[ns]`\n", "\n", - "The physical type conversion is done based on the `LogicalType` that has been specified or inferred for a given column.\n", - "\n", - "When using Woodwork with a Spark DataFrame, the physical types used may be different than those listed above. For more information, refer to the guide [Using Woodwork with Dask and Spark DataFrames](https://woodwork.alteryx.com/en/stable/guides/using_woodwork_with_dask_and_spark.html#Notes-on-Spark-Dtype-Conversions)." + "The physical type conversion is done based on the `LogicalType` that has been specified or inferred for a given column." ] }, { diff --git a/docs/source/install.md b/docs/source/install.md index 68f317e71..0be953cbb 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -20,23 +20,11 @@ $ conda install -c conda-forge woodwork Woodwork allows users to install add-ons. Woodwork allows users to install add-ons individually or all at once: -```{hint} -Be sure to install [Scala and Spark](#scala-and-spark) -``` - ````{tab} PyPI ```{tab} All Add-ons ```console $ python -m pip install "woodwork[complete]" ``` -```{tab} Dask -```console -$ python -m pip install "woodwork[dask]" -``` -```{tab} Spark -```console -$ python -m pip install "woodwork[spark]" -``` ```{tab} Update Checker ```console $ python -m pip install "woodwork[updater]" @@ -45,77 +33,25 @@ $ python -m pip install "woodwork[updater]" ````{tab} Conda ```{tab} All Add-ons ```console -$ conda install -c conda-forge dask pyspark alteryx-open-src-update-checker -``` -```{tab} Dask -```console -$ conda install -c conda-forge dask -``` -```{tab} Spark -```console -$ conda install -c conda-forge pyspark +$ conda install -c conda-forge alteryx-open-src-update-checker ``` ```{tab} Update Checker ```console $ conda install -c conda-forge alteryx-open-src-update-checker ``` ```` -- **Dask**: Use Woodwork with Dask DataFrames -- **Spark**: Use Woodwork with Spark DataFrames - **Update Checker**: Receive automatic notifications of new Woodwork releases ## Source To install Woodwork from source, clone the repository from [Github](https://github.com/alteryx/woodwork), and install the dependencies. -```{hint} -Be sure to install [Scala and Spark](#scala-and-spark) if you want to run all unit tests -``` - ```bash git clone https://github.com/alteryx/woodwork.git cd woodwork python -m pip install . 
``` -## Scala and Spark - -````{tab} macOS (Intel) -:new-set: -```console -$ brew tap AdoptOpenJDK/openjdk -$ brew install --cask adoptopenjdk11 -$ brew install scala apache-spark -$ echo 'export JAVA_HOME=$(/usr/libexec/java_home)' >> ~/.zshrc -$ echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> ~/.zshrc -``` -```` - -````{tab} macOS (M1) -```console -$ brew install openjdk@11 scala apache-spark pandoc -$ echo 'export PATH="/opt/homebrew/opt/openjdk@11/bin:$PATH"' >> ~/.zshrc -$ echo 'export CPPFLAGS="-I/opt/homebrew/opt/openjdk@11/include:$CPPFLAGS"' >> ~/.zprofile -$ sudo ln -sfn /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk /Library/Java/JavaVirtualMachines/openjdk-11.jdk -``` -```` - -````{tab} Ubuntu -```console -$ sudo apt install openjdk-11-jre openjdk-11-jdk scala pandoc -y -$ echo "export SPARK_HOME=/opt/spark" >> ~/.profile -$ echo "export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin" >> ~/.profile -$ echo "export PYSPARK_PYTHON=/usr/bin/python3" >> ~/.profile -``` -```` - -````{tab} Amazon Linux -```console -$ sudo amazon-linux-extras install java-openjdk11 scala -y -$ amazon-linux-extras enable java-openjdk11 -``` -```` - ## Docker It is also possible to run Woodwork inside a Docker container. @@ -138,8 +74,6 @@ Woodwork has several other Python dependencies that are used only for specific m | boto3 | 1.34.32 | Required to read/write to URLs and S3 | | smart_open | 5.0.0 | Required to read/write to URLs and S3 | | pyarrow | 15.0.0 | Required to serialize to parquet | -| dask[distributed] | 2024.1.0 | Required to use with Dask DataFrames | -| pyspark | 3.5.0 | Required to use with Spark DataFrames | # Development diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 8319fe7fd..a5ca1a639 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -9,12 +9,18 @@ Future Release * Fixes * Changes * Add support for Python 3.12 :pr:`1855` + * Drop support for using Woodwork with Dask or Pyspark dataframes (:pr:`1857`) * Documentation Changes * Testing Changes Thanks to the following people for contributing to this release: :user:`thehomebrewnerd` +Breaking Changes +++++++++++++++++ +* With this release, Woodwork can no longer be used with Dask or Pyspark dataframes. The behavior when using pandas + dataframes remains unchanged. + v0.30.0 Apr 10, 2024 ==================== .. 
warning:: diff --git a/pyproject.toml b/pyproject.toml index 0a1a53065..32b24acdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,14 +56,6 @@ test = [ "smart-open >= 5.0.0", "pyarrow >= 14.0.1" ] -dask = [ - "dask[dataframe] >= 2024.4.1", -] -spark = [ - "pyspark >= 3.5.0", - "numpy >= 1.25.0", - "pyarrow >= 14.0.1", -] updater = [ "alteryx-open-src-update-checker >= 3.1.0" ] @@ -77,15 +69,16 @@ docs = [ "nbconvert == 6.5.0", "ipython == 8.4.0", "jupyter == 1.0.0", - "woodwork[dask, spark, test]", + "pyarrow >= 14.0.1", ] dev = [ "ruff >= 0.1.6", "pre-commit >= 2.20.0", - "click >= 8.1.7" + "click >= 8.1.7", + "woodwork[test,docs]", ] complete = [ - "woodwork[dask, spark, updater]", + "woodwork[updater]", ] [tool.setuptools] diff --git a/woodwork/accessor_utils.py b/woodwork/accessor_utils.py index 3d8395067..efe530164 100644 --- a/woodwork/accessor_utils.py +++ b/woodwork/accessor_utils.py @@ -4,10 +4,7 @@ import pandas as pd from woodwork.exceptions import ColumnNotPresentInSchemaError, WoodworkNotInitError -from woodwork.utils import _get_column_logical_type, import_or_none - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") +from woodwork.utils import _get_column_logical_type def init_series( @@ -75,20 +72,12 @@ def init_series( def _is_series(data): if isinstance(data, pd.Series): return True - elif _is_dask_series(data): - return True - elif _is_spark_series(data): - return True return False def _is_dataframe(data): if isinstance(data, pd.DataFrame): return True - elif _is_dask_dataframe(data): - return True - elif _is_spark_dataframe(data): - return True return False @@ -128,8 +117,7 @@ def get_invalid_schema_message(dataframe, schema): f"dtype mismatch for column {name} between DataFrame dtype, " f"{df_dtype}, and {logical_types[name]} dtype, {valid_dtype}" ) - if schema.index is not None and isinstance(dataframe, pd.DataFrame): - # Index validation not performed for Dask/Spark + if schema.index is not None: if not pd.Series(dataframe.index, dtype=dataframe[schema.index].dtype).equals( pd.Series(dataframe[schema.index].values), ): @@ -157,30 +145,6 @@ def is_schema_valid(dataframe, schema): return True -def _is_dask_series(data): - if dd and isinstance(data, dd.Series): - return True - return False - - -def _is_dask_dataframe(data): - if dd and isinstance(data, dd.DataFrame): - return True - return False - - -def _is_spark_dataframe(data): - if ps and isinstance(data, ps.DataFrame): - return True - return False - - -def _is_spark_series(data): - if ps and isinstance(data, ps.Series): - return True - return False - - def _check_column_schema(method): """Decorator for WoodworkColumnAccessor that checks schema initialization""" diff --git a/woodwork/column_accessor.py b/woodwork/column_accessor.py index 7084b5408..c5b2c36b0 100644 --- a/woodwork/column_accessor.py +++ b/woodwork/column_accessor.py @@ -22,10 +22,7 @@ from woodwork.logical_types import _NULLABLE_PHYSICAL_TYPES, LatLong, Ordinal from woodwork.statistics_utils import _get_box_plot_info_for_column from woodwork.table_schema import TableSchema -from woodwork.utils import _get_column_logical_type, import_or_none - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") +from woodwork.utils import _get_column_logical_type class WoodworkColumnAccessor: @@ -580,18 +577,3 @@ def _validate_schema(schema, series): @pd.api.extensions.register_series_accessor("ww") class PandasColumnAccessor(WoodworkColumnAccessor): pass - - -if dd: - - 
@dd.extensions.register_series_accessor("ww") - class DaskColumnAccessor(WoodworkColumnAccessor): - pass - - -if ps: - from pyspark.pandas.extensions import register_series_accessor - - @register_series_accessor("ww") - class SparkColumnAccessor(WoodworkColumnAccessor): - pass diff --git a/woodwork/deserialize.py b/woodwork/deserialize.py index 7caf5e536..a992a1cb4 100644 --- a/woodwork/deserialize.py +++ b/woodwork/deserialize.py @@ -44,7 +44,7 @@ def read_woodwork_table( typing_info_filename (str, optional): The name of the JSON file used to store the Woodwork typing information during serialization. Defaults to "woodwork_typing_info.json". format (str, optional): The format used to serialize the data. Required if the serialized filename suffix does not - match the format or when deserializing from parquet files into Dask or Spark dataframes. + match the format. profile_name (str, bool): The AWS profile specified to write to S3. Will default to None and search for AWS credentials. Set to False to use an anonymous profile. validate (bool, optional): Whether parameter and data validation should occur when initializing Woodwork dataframe diff --git a/woodwork/deserializers/csv_deserializer.py b/woodwork/deserializers/csv_deserializer.py index 19cf1aa00..ed5538d40 100644 --- a/woodwork/deserializers/csv_deserializer.py +++ b/woodwork/deserializers/csv_deserializer.py @@ -1,3 +1,5 @@ +import pandas as pd + from woodwork.deserializers.deserializer_base import Deserializer @@ -7,5 +9,4 @@ class CSVDeserializer(Deserializer): format = "csv" def read_from_local_path(self): - lib = self._get_library() - return lib.read_csv(self.read_path, dtype=self.column_dtypes, **self.kwargs) + return pd.read_csv(self.read_path, dtype=self.column_dtypes, **self.kwargs) diff --git a/woodwork/deserializers/deserializer_base.py b/woodwork/deserializers/deserializer_base.py index 547574931..418e426f3 100644 --- a/woodwork/deserializers/deserializer_base.py +++ b/woodwork/deserializers/deserializer_base.py @@ -11,7 +11,7 @@ from woodwork.exceptions import OutdatedSchemaWarning, UpgradeSchemaWarning from woodwork.s3_utils import get_transport_params, use_smartopen from woodwork.serializers.serializer_base import SCHEMA_VERSION -from woodwork.utils import _is_s3, _is_url, import_or_raise +from woodwork.utils import _is_s3, _is_url PYARROW_IMPORT_ERROR_MESSAGE_DESERIALIZE = ( f"The pyarrow library is required to deserialize from {format}.\n" @@ -93,7 +93,7 @@ def _set_init_dict(self, loading_info): else: cat_object = pd.CategoricalDtype(pd.Series(cat_values)) col_type = cat_object - elif table_type == "spark" and col_type == "object": + elif col_type == "object": col_type = "string" self.column_dtypes[col_name] = col_type @@ -138,32 +138,6 @@ def read_from_local_path(self): "Must define read_from_local_path on Deserializer subclass", ) # pragma: no cover - def _get_library(self): - table_type = self.typing_info["loading_info"]["table_type"] - if table_type == "dask": - DASK_ERR_MSG = ( - "Cannot load Dask DataFrame - unable to import Dask.\n\n" - "Please install with pip or conda:\n\n" - 'python -m pip install "woodwork[dask]"\n\n' - "conda install dask" - ) - lib = import_or_raise("dask.dataframe", DASK_ERR_MSG) - elif table_type == "spark": - SPARK_ERR_MSG = ( - "Cannot load Spark DataFrame - unable to import Spark.\n\n" - "Please install with pip or conda:\n\n" - 'python -m pip install "woodwork[spark]"\n\n' - "conda install spark\n\n" - "conda install pyspark" - ) - lib = import_or_raise("pyspark.pandas", 
SPARK_ERR_MSG) - if "compression" in self.kwargs.keys(): - self.kwargs["compression"] = str(self.kwargs["compression"]) - else: - lib = pd - - return lib - def _check_schema_version(saved_version_str): """Warns users if the schema used to save their data is greater than the latest diff --git a/woodwork/deserializers/feather_deserializer.py b/woodwork/deserializers/feather_deserializer.py index b3dd5a4a9..4bde6a7e1 100644 --- a/woodwork/deserializers/feather_deserializer.py +++ b/woodwork/deserializers/feather_deserializer.py @@ -1,3 +1,5 @@ +import pandas as pd + from woodwork.deserializers.deserializer_base import Deserializer @@ -7,5 +9,4 @@ class FeatherDeserializer(Deserializer): format = "feather" def read_from_local_path(self): - lib = self._get_library() - return lib.read_feather(self.read_path) + return pd.read_feather(self.read_path) diff --git a/woodwork/deserializers/orc_deserializer.py b/woodwork/deserializers/orc_deserializer.py index b21731050..8af035646 100644 --- a/woodwork/deserializers/orc_deserializer.py +++ b/woodwork/deserializers/orc_deserializer.py @@ -1,3 +1,5 @@ +import pandas as pd + from woodwork.deserializers.deserializer_base import Deserializer @@ -7,5 +9,4 @@ class OrcDeserializer(Deserializer): format = "orc" def read_from_local_path(self): - lib = self._get_library() - return lib.read_orc(self.read_path) + return pd.read_orc(self.read_path) diff --git a/woodwork/deserializers/parquet_deserializer.py b/woodwork/deserializers/parquet_deserializer.py index 2123729ba..27a03ce29 100644 --- a/woodwork/deserializers/parquet_deserializer.py +++ b/woodwork/deserializers/parquet_deserializer.py @@ -4,6 +4,8 @@ import tempfile from pathlib import Path +import pandas as pd + from woodwork.deserializers.deserializer_base import ( PYARROW_IMPORT_ERROR_MESSAGE_DESERIALIZE, Deserializer, @@ -46,8 +48,7 @@ def configure_deserializer(self): def read_from_local_path(self): self.configure_deserializer() - lib = self._get_library() - return lib.read_parquet(self.read_path, engine=self.kwargs["engine"]) + return pd.read_parquet(self.read_path, engine=self.kwargs["engine"]) def read_from_s3(self, profile_name): with tempfile.TemporaryDirectory() as tmpdir: @@ -67,18 +68,4 @@ def read_from_s3(self, profile_name): return self.read_from_local_path() def _set_metadata_path(self): - # If we are reading a single pandas file, we get the metadata from the file. - # If we are reading into Dask/Spark we need to get the metadata from the - # first file that was serialized. 
self.metadata_path = self.read_path - if os.path.isdir(self.read_path): - files = os.listdir(self.read_path) - if "part.0.parquet" in files: - # Dask will serialize with "part.*.parquet" file names - self.metadata_path = os.path.join(self.read_path, "part.0.parquet") - elif any(["snappy.parquet" in f for f in files]): - # Spark will serialize files with a unique hash but with the ".snappy.parquet" extension - parquet_files = sorted( - [f for f in files if Path(f).suffix == ".parquet"], - ) - self.metadata_path = os.path.join(self.read_path, parquet_files[0]) diff --git a/woodwork/deserializers/pickle_deserializer.py b/woodwork/deserializers/pickle_deserializer.py index c1ed16493..85b5fe0dc 100644 --- a/woodwork/deserializers/pickle_deserializer.py +++ b/woodwork/deserializers/pickle_deserializer.py @@ -1,3 +1,5 @@ +import pandas as pd + from woodwork.deserializers.deserializer_base import Deserializer @@ -7,5 +9,4 @@ class PickleDeserializer(Deserializer): format = "pickle" def read_from_local_path(self): - lib = self._get_library() - return lib.read_pickle(self.read_path, **self.kwargs) + return pd.read_pickle(self.read_path, **self.kwargs) diff --git a/woodwork/deserializers/utils.py b/woodwork/deserializers/utils.py index bd750ad1e..2d25c775c 100644 --- a/woodwork/deserializers/utils.py +++ b/woodwork/deserializers/utils.py @@ -44,7 +44,7 @@ def _get_deserializer( typing_info_filename (str, optional): The name of the JSON file used to store the Woodwork typing information during serialization. Defaults to "woodwork_typing_info.json". format (str, optional): The format used to serialize the data. Required if the serialized filename suffix does not - match the format or when deserializing from parquet files into Dask or Spark dataframes. + match the format. profile_name (str, bool): The AWS profile specified to write to S3. Will default to None and search for AWS credentials. Set to False to use an anonymous profile. 
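
With the Dask/Spark branches removed above, every serializer and deserializer goes straight through pandas. A minimal sketch of the resulting round trip under this change (the directory name and column names below are illustrative only, not taken from this patch; parquet requires pyarrow, which the updated dependencies include):

    import pandas as pd
    import woodwork as ww

    # Build a small pandas DataFrame and initialize Woodwork typing information.
    df = pd.DataFrame({"id": [0, 1, 2], "code": ["a", "b", "a"]})
    df.ww.init(index="id", logical_types={"code": "Categorical"})

    # Write data plus typing info, then read it back; both paths are pandas-only now.
    df.ww.to_disk("ww_data", format="parquet")
    df2 = ww.read_woodwork_table("ww_data")

    assert isinstance(df2, pd.DataFrame)
    assert df2.ww.schema == df.ww.schema
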
diff --git a/woodwork/indexers.py b/woodwork/indexers.py index 545a73966..752f6903a 100644 --- a/woodwork/indexers.py +++ b/woodwork/indexers.py @@ -1,8 +1,6 @@ import copy from woodwork.accessor_utils import ( - _is_dask_dataframe, - _is_dask_series, _is_dataframe, _is_series, ) @@ -11,10 +9,6 @@ class _iLocIndexer: def __init__(self, data): self.data = data - if _is_dask_dataframe(data): - raise TypeError("iloc is not supported for Dask DataFrames") - elif _is_dask_series(data): - raise TypeError("iloc is not supported for Dask Series") def __getitem__(self, key): selection = self.data.iloc[key] @@ -32,11 +26,7 @@ def __getitem__(self, key): def _process_selection(selection, original_data): if _is_series(selection): - if _is_dask_series(selection): - # Dask index values are a delayed object - can't compare below without computing - index_vals = selection.index.values.compute() - else: - index_vals = selection.index.values + index_vals = selection.index.values if _is_dataframe(original_data) and set(index_vals) == set( original_data.columns, ): diff --git a/woodwork/logical_types.py b/woodwork/logical_types.py index de57de59e..8851cd90e 100644 --- a/woodwork/logical_types.py +++ b/woodwork/logical_types.py @@ -1,4 +1,3 @@ -import re import warnings from datetime import datetime from typing import Optional @@ -9,7 +8,6 @@ from pandas.api import types as pdtypes import woodwork as ww -from woodwork.accessor_utils import _is_dask_series, _is_spark_series from woodwork.config import config from woodwork.exceptions import ( TypeConversionError, @@ -23,13 +21,8 @@ _is_valid_latlong_value, _reformat_to_latlong, camel_to_snake, - import_or_none, ) -dd = import_or_none("dask.dataframe") -dask_expr = import_or_none("dask_expr") -ps = import_or_none("pyspark.pandas") - class ClassNameDescriptor(object): """Descriptor to convert a class's name from camelcase to snakecase""" @@ -48,7 +41,6 @@ class LogicalType(object, metaclass=LogicalTypeMetaClass): type_string = ClassNameDescriptor() primary_dtype = "string" - pyspark_dtype = None standard_tags = set() def __eq__(self, other, deep=False): @@ -62,10 +54,7 @@ def __str__(self): @classmethod def _get_valid_dtype(cls, series_type): """Return the dtype that is considered valid for a series with the given logical_type""" - if ps and series_type == ps.Series and cls.pyspark_dtype: - return cls.pyspark_dtype - else: - return cls.primary_dtype + return cls.primary_dtype def transform(self, series, null_invalid_values=False): """Converts the series dtype to match the logical type's if it is different.""" @@ -228,11 +217,8 @@ def transform(self, series, null_invalid_values=False): ve = ValueError( "Expected no null values in this Boolean column. If you want to keep the nulls, use BooleanNullable type. 
Otherwise, cast these nulls to a boolean value with the `cast_null_as` parameter.", ) - is_dask = _is_dask_series(series) if not pdtypes.is_dtype_equal("bool", series.dtype): - if (is_dask and series.isna().any().compute()) or ( - not is_dask and series.isna().any() - ): + if series.isna().any(): if self.cast_nulls_as is None: raise ve series.fillna(self.cast_nulls_as, inplace=True) @@ -272,7 +258,6 @@ class Categorical(LogicalType): """ primary_dtype = "category" - pyspark_dtype = "string" standard_tags = {"category"} def __init__(self, encoding=None): @@ -293,7 +278,6 @@ class CountryCode(LogicalType): """ primary_dtype = "category" - pyspark_dtype = "string" standard_tags = {"category"} @@ -308,7 +292,6 @@ class CurrencyCode(LogicalType): """ primary_dtype = "category" - pyspark_dtype = "string" standard_tags = {"category"} @@ -358,52 +341,31 @@ def _year_filter(date): series, ) utc = self.datetime_format and self.datetime_format.endswith("%z") - if _is_dask_series(series): - name = series.name - series = dd.to_datetime( + + try: + series = pd.to_datetime( series, format=self.datetime_format, - errors="coerce", utc=utc, ) - series.name = name - elif _is_spark_series(series): - series = ps.Series( - ps.to_datetime( - series.to_numpy(), - format=self.datetime_format, - errors="coerce", - ), - name=series.name, + except (TypeError, ValueError): + warnings.warn( + f"Some rows in series '{series.name}' are incompatible with datetime format " + f"'{self.datetime_format}' and have been replaced with null values. You may be " + "able to fix this by using an instantiated Datetime logical type with a different format " + "string specified for this column during Woodwork initialization.", + TypeConversionWarning, + ) + series = pd.to_datetime( + series, + format=self.datetime_format, + errors="coerce", + utc=utc, ) - else: - try: - series = pd.to_datetime( - series, - format=self.datetime_format, - utc=utc, - ) - except (TypeError, ValueError): - warnings.warn( - f"Some rows in series '{series.name}' are incompatible with datetime format " - f"'{self.datetime_format}' and have been replaced with null values. You may be " - "able to fix this by using an instantiated Datetime logical type with a different format " - "string specified for this column during Woodwork initialization.", - TypeConversionWarning, - ) - series = pd.to_datetime( - series, - format=self.datetime_format, - errors="coerce", - utc=utc, - ) series = self._remove_timezone(series) if self.datetime_format is not None and "%y" in self.datetime_format: - if _is_spark_series(series): - series = series.transform(_year_filter) - else: - series = series.apply(_year_filter) + series = series.apply(_year_filter) return super().transform(series) @@ -558,8 +520,7 @@ class LatLong(LogicalType): Note: LatLong values will be stored with the object dtype as a - tuple of floats (or a list of floats for Spark DataFrames) - and must contain only two values. + tuple of floats and must contain only two values. 
Null latitude or longitude values will be stored as np.nan, and a fully null LatLong (np.nan, np.nan) will be stored as just a @@ -576,22 +537,10 @@ class LatLong(LogicalType): primary_dtype = "object" def transform(self, series, null_invalid_values=False): - """Formats a series to be a tuple (or list for Spark) of two floats.""" + """Formats a series to be a tuple of two floats.""" if null_invalid_values: series = _coerce_latlong(series) - - if _is_dask_series(series): - name = series.name - meta = (name, tuple([float, float])) - series = series.apply(_reformat_to_latlong, meta=meta) - elif _is_spark_series(series): - formatted_series = series.to_pandas().apply( - _reformat_to_latlong, - is_spark=True, - ) - series = ps.from_pandas(formatted_series) - else: - series = series.apply(_reformat_to_latlong) + series = series.apply(_reformat_to_latlong) return super().transform(series) @@ -653,7 +602,6 @@ class Ordinal(LogicalType): """ primary_dtype = "category" - pyspark_dtype = "string" standard_tags = {"category"} def __init__(self, order=None): @@ -743,7 +691,6 @@ class SubRegionCode(LogicalType): """ primary_dtype = "category" - pyspark_dtype = "string" standard_tags = {"category"} @@ -806,7 +753,6 @@ class PostalCode(LogicalType): """ primary_dtype = "category" - pyspark_dtype = "string" standard_tags = {"category"} def transform(self, series, null_invalid_values=False): @@ -831,8 +777,6 @@ def validate(self, series, return_invalid_values=False): Returns: Series: If return_invalid_values is True, returns invalid PostalCodes. """ - if _is_dask_series(series): - series = series.compute() return _regex_validate( "postal_code_inference_regex", series, @@ -869,8 +813,6 @@ def _regex_validate(regex_key, series, return_invalid_values): else: any_invalid = invalid.any() - if dd and isinstance(any_invalid, (dd.core.Scalar, dask_expr.Scalar)): - any_invalid = any_invalid.compute() if any_invalid: type_string = { @@ -896,8 +838,7 @@ def _replace_nans(series: pd.Series, primary_dtype: Optional[str] = None) -> pd. if str(original_dtype) == "string": series = series.replace(ww.config.get_option("nan_values"), pd.NA) return series - if not _is_spark_series(series): - series = series.replace(ww.config.get_option("nan_values"), np.nan) + series = series.replace(ww.config.get_option("nan_values"), np.nan) if str(original_dtype) == "boolean": series = series.astype(original_dtype) @@ -912,8 +853,6 @@ def _validate_age(series, return_invalid_values): else: any_invalid = invalid.any() - if dd and isinstance(any_invalid, (dd.core.Scalar, dask_expr.Scalar)): - any_invalid = any_invalid.compute() if any_invalid: info = f"Series {series.name} contains negative values." 
@@ -926,17 +865,7 @@ def _get_index_invalid_integer(series): def _get_index_invalid_string(series, regex_key): regex = config.get_option(regex_key) - - if _is_spark_series(series): - - def match(x): - if isinstance(x, str): - return bool(re.match(regex, x)) - - return series.apply(match).astype("boolean") == False # noqa: E712 - - else: - return ~series.str.match(regex).astype("boolean") + return ~series.str.match(regex).astype("boolean") def _get_index_invalid_age(series): diff --git a/woodwork/serializers/csv_serializer.py b/woodwork/serializers/csv_serializer.py index c024828c9..7ddefb368 100644 --- a/woodwork/serializers/csv_serializer.py +++ b/woodwork/serializers/csv_serializer.py @@ -1,7 +1,6 @@ import glob import os -from woodwork.accessor_utils import _is_dask_dataframe, _is_spark_dataframe from woodwork.exceptions import WoodworkFileExistsError from woodwork.serializers.serializer_base import Serializer @@ -21,24 +20,13 @@ def __init__(self, **kwargs): } def serialize(self, dataframe, profile_name, **kwargs): - if _is_spark_dataframe(dataframe): - if self.filename is not None: - raise ValueError( - "Writing a Spark dataframe to csv with a filename specified is not supported", - ) - self.default_kwargs["multiline"] = True - self.default_kwargs["ignoreLeadingWhitespace"] = False - self.default_kwargs["ignoreTrailingWhitespace"] = False self.kwargs = {**self.default_kwargs, **kwargs} return super().serialize(dataframe, profile_name, **kwargs) def _get_filename(self): if self.filename is None: ww_name = self.dataframe.ww.name or "data" - if _is_dask_dataframe(self.dataframe): - basename = "{}-*.{}".format(ww_name, self.format) - else: - basename = ".".join([ww_name, self.format]) + basename = ".".join([ww_name, self.format]) else: basename = self.filename self.location = basename @@ -56,12 +44,6 @@ def write_dataframe(self): # engine kwarg not needed for writing, only reading if "engine" in csv_kwargs.keys(): del csv_kwargs["engine"] - if _is_spark_dataframe(self.dataframe): - dataframe = self.dataframe.ww.copy() - columns = list(dataframe.select_dtypes("object").columns) - dataframe[columns] = dataframe[columns].astype(str) - csv_kwargs["compression"] = str(csv_kwargs["compression"]) - else: - dataframe = self.dataframe + dataframe = self.dataframe file = self._get_filename() dataframe.to_csv(file, **csv_kwargs) diff --git a/woodwork/serializers/orc_serializer.py b/woodwork/serializers/orc_serializer.py index f77413498..8bfae26a8 100644 --- a/woodwork/serializers/orc_serializer.py +++ b/woodwork/serializers/orc_serializer.py @@ -1,4 +1,3 @@ -from woodwork.accessor_utils import _is_dask_dataframe from woodwork.serializers.serializer_base import ( PYARROW_IMPORT_ERROR_MESSAGE, Serializer, @@ -14,10 +13,6 @@ class OrcSerializer(Serializer): def serialize(self, dataframe, profile_name, **kwargs): import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE) - # Serialization to orc relies on pyarrow.Table.from_pandas which doesn't work with Dask - if _is_dask_dataframe(dataframe): - msg = "DataFrame type not compatible with orc serialization. Please serialize to another format." 
- raise ValueError(msg) self.kwargs["engine"] = "pyarrow" return super().serialize(dataframe, profile_name, **kwargs) diff --git a/woodwork/serializers/parquet_serializer.py b/woodwork/serializers/parquet_serializer.py index c2ac28405..e0a9abdc6 100644 --- a/woodwork/serializers/parquet_serializer.py +++ b/woodwork/serializers/parquet_serializer.py @@ -1,12 +1,8 @@ import json import os import warnings -from pathlib import Path -import pandas as pd - -from woodwork.accessor_utils import _is_dask_dataframe, _is_spark_dataframe -from woodwork.exceptions import ParametersIgnoredWarning, WoodworkFileExistsError +from woodwork.exceptions import ParametersIgnoredWarning from woodwork.serializers.serializer_base import ( PYARROW_IMPORT_ERROR_MESSAGE, Serializer, @@ -31,14 +27,6 @@ def __init__(self, path, filename, data_subdirectory, typing_info_filename): self.typing_info_filename = None def serialize(self, dataframe, profile_name, **kwargs): - if self.filename is not None and _is_dask_dataframe(dataframe): - raise ValueError( - "Writing a Dask dataframe to parquet with a filename specified is not supported", - ) - if self.filename is not None and _is_spark_dataframe(dataframe): - raise ValueError( - "Writing a Spark dataframe to parquet with a filename specified is not supported", - ) self.kwargs["engine"] = "pyarrow" return super().serialize(dataframe, profile_name, **kwargs) @@ -56,17 +44,15 @@ def save_to_local_path(self): def _create_pyarrow_table(self): """Create a pyarrow table for pandas. This table will get updated to included - Woodwork typing info before saving. Skip for Dask/Spark because for those formats - typing information has to be added after files are saved to disk.""" + Woodwork typing info before saving.""" import pyarrow as pa - if isinstance(self.dataframe, pd.DataFrame): - dataframe = clean_latlong(self.dataframe) - self.table = pa.Table.from_pandas(dataframe) + dataframe = clean_latlong(self.dataframe) + self.table = pa.Table.from_pandas(dataframe) def _generate_parquet_metadata(self): """Generate metadata for the parquet file header. For pandas this includes additional - information needed by pandas. For Dask/Spark, this includes only the Woodwork typing info. + information needed by pandas. """ loading_info = { "location": self.location, @@ -74,63 +60,17 @@ def _generate_parquet_metadata(self): "params": self.kwargs, } self.typing_info["loading_info"].update(loading_info) - # For Dask and Spark we only get the WW metadata because we haven't created - # the pyarrow table yet, but for pandas we combine the existing parquet - # metadata with the WW metadata. 
- if _is_dask_dataframe(self.dataframe) or _is_spark_dataframe(self.dataframe): - metadata = { - "ww_meta".encode(): json.dumps(self.typing_info).encode(), - } - else: - table_metadata = self.table.schema.metadata - metadata = { - "ww_meta".encode(): json.dumps(self.typing_info).encode(), - **table_metadata, - } + table_metadata = self.table.schema.metadata + metadata = { + "ww_meta".encode(): json.dumps(self.typing_info).encode(), + **table_metadata, + } self.metadata = metadata def _save_parquet_table_to_disk(self): """Writes data to disk with the updated metadata including WW typing info.""" from pyarrow import parquet as pq - if _is_dask_dataframe(self.dataframe): - path, dataframe = self._setup_for_dask_and_spark() - dataframe.to_parquet(path, custom_metadata=self.metadata) - elif _is_spark_dataframe(self.dataframe): - path, dataframe = self._setup_for_dask_and_spark() - dataframe.to_parquet(path) - files = os.listdir(path) - - # Update first parquet file to save WW metadata - parquet_files = sorted([f for f in files if Path(f).suffix == ".parquet"]) - update_file = os.path.join(path, parquet_files[0]) - table = pq.read_table(update_file) - table_metadata = table.schema.metadata - combined_meta = { - **self.metadata, - **table_metadata, - } - table = table.replace_schema_metadata(combined_meta) - pq.write_table(table, update_file, use_deprecated_int96_timestamps=True) - - # Remove checksum files which prevent deserialization if present due to updated parquet header - crc_files = [f for f in files if Path(f).suffix == ".crc"] - for file in crc_files: - os.remove(os.path.join(path, file)) - else: - file = self._get_filename() - self.table = self.table.replace_schema_metadata(self.metadata) - pq.write_table(self.table, file) - - def _setup_for_dask_and_spark(self): - """Perform additional path setup required for Dask/Spark. Since Dask/Spark deserialize to - directories only, the `_get_filename` method does not work like it does for pandas. - """ - path = self.path - if self.data_subdirectory is not None: - path = os.path.join(path, self.data_subdirectory) - if any([Path(f).suffix == ".parquet" for f in os.listdir(path)]): - message = f"Data file already exists at '{path}'. " - message += "Please remove or use a different directory." - raise WoodworkFileExistsError(message) - return path, clean_latlong(self.dataframe) + file = self._get_filename() + self.table = self.table.replace_schema_metadata(self.metadata) + pq.write_table(self.table, file) diff --git a/woodwork/serializers/pickle_serializer.py b/woodwork/serializers/pickle_serializer.py index f9ccdc9cc..35083c7c3 100644 --- a/woodwork/serializers/pickle_serializer.py +++ b/woodwork/serializers/pickle_serializer.py @@ -1,5 +1,3 @@ -import pandas as pd - from woodwork.serializers.serializer_base import Serializer @@ -9,9 +7,5 @@ class PickleSerializer(Serializer): format = "pickle" def write_dataframe(self): - if not isinstance(self.dataframe, pd.DataFrame): - msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format." 
- raise ValueError(msg) - file = self._get_filename() self.dataframe.to_pickle(file, **self.kwargs) diff --git a/woodwork/serializers/serializer_base.py b/woodwork/serializers/serializer_base.py index 5a161e3e2..c1bcf6455 100644 --- a/woodwork/serializers/serializer_base.py +++ b/woodwork/serializers/serializer_base.py @@ -4,7 +4,6 @@ import tarfile import tempfile -from woodwork.accessor_utils import _is_dask_dataframe, _is_spark_dataframe from woodwork.exceptions import WoodworkFileExistsError from woodwork.logical_types import LatLong from woodwork.s3_utils import get_transport_params, use_smartopen @@ -146,14 +145,6 @@ def typing_info_to_dict(dataframe): Returns: dict: Dictionary containing Woodwork typing information """ - if _is_dask_dataframe(dataframe): - # Need to determine the category info for Dask it can be saved below - category_cols = [ - colname - for colname, col in dataframe.ww._schema.columns.items() - if col.is_categorical - ] - dataframe = dataframe.ww.categorize(columns=category_cols) ordered_columns = dataframe.columns def _get_physical_type_dict(column): @@ -181,12 +172,7 @@ def _get_physical_type_dict(column): for col_name, col in dataframe.ww.columns.items() ] - if _is_dask_dataframe(dataframe): - table_type = "dask" - elif _is_spark_dataframe(dataframe): - table_type = "spark" - else: - table_type = "pandas" + table_type = "pandas" return { "schema_version": SCHEMA_VERSION, diff --git a/woodwork/statistics_utils/_get_box_plot_info_for_column.py b/woodwork/statistics_utils/_get_box_plot_info_for_column.py index 32e5b50d6..9e39e7ccf 100644 --- a/woodwork/statistics_utils/_get_box_plot_info_for_column.py +++ b/woodwork/statistics_utils/_get_box_plot_info_for_column.py @@ -6,11 +6,6 @@ import woodwork as ww from woodwork.statistics_utils._get_medcouple_statistic import _sample_for_medcouple -from woodwork.utils import import_or_none - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - method_result = namedtuple( "MedcoupleHeuristicResult", @@ -147,11 +142,6 @@ def _get_box_plot_info_for_column( if quantiles and not isinstance(quantiles, dict): raise TypeError("quantiles must be a dictionary.") - if dd and isinstance(series, dd.Series): - series = series.compute() - if ps and isinstance(series, ps.Series): - series = series.to_pandas() - # remove null values from the data series = series.dropna() diff --git a/woodwork/statistics_utils/_get_dependence_dict.py b/woodwork/statistics_utils/_get_dependence_dict.py index 2f7bfec69..512447bf5 100644 --- a/woodwork/statistics_utils/_get_dependence_dict.py +++ b/woodwork/statistics_utils/_get_dependence_dict.py @@ -4,7 +4,6 @@ import numpy as np -from woodwork.accessor_utils import _is_dask_dataframe, _is_spark_dataframe from woodwork.statistics_utils._bin_numeric_cols_into_categories import ( _bin_numeric_cols_into_categories, ) @@ -162,10 +161,6 @@ def _get_dependence_dict( data = dataframe_with_bools_to_int.loc[:, valid_columns] # cut off data if necessary - if _is_dask_dataframe(data): - data = data.compute() - elif _is_spark_dataframe(dataframe_with_bools_to_int): - data = data.to_pandas() if nrows is not None and nrows < data.shape[0]: data = data.sample(nrows, random_state=random_seed) @@ -291,19 +286,14 @@ def categorical_column_drop_helper(df): total = sum(df_uniques.to_numpy()) if total > total_unique: # try to use mergesort to keep the order of the columns - if not _is_spark_dataframe(df): - drop = df_uniques.sort_values(ascending=False, kind="mergesort").index[ - 0 - ] - else: - drop = 
df_uniques.sort_values(ascending=False).index.tolist()[0] + drop = df_uniques.sort_values(ascending=False, kind="mergesort").index[0] cols_to_drop.append(drop) df = df.drop(cols_to_drop, axis=1) cols_to_drop += categorical_column_drop_helper(df) return cols_to_drop categoricals = datatable.ww.select("category").columns - # dask dataframe does not have support for `nunique`, but it should be a feature coming in a future release - if len(categoricals) and not _is_dask_dataframe(datatable): + + if len(categoricals): return categorical_column_drop_helper(datatable[categoricals]) return [] diff --git a/woodwork/statistics_utils/_get_describe_dict.py b/woodwork/statistics_utils/_get_describe_dict.py index db43a0206..ffe5b1887 100644 --- a/woodwork/statistics_utils/_get_describe_dict.py +++ b/woodwork/statistics_utils/_get_describe_dict.py @@ -4,14 +4,12 @@ import pandas as pd -from woodwork.accessor_utils import _is_dask_dataframe, _is_spark_dataframe from woodwork.logical_types import ( Age, AgeNullable, Datetime, Integer, IntegerNullable, - LatLong, Unknown, ) from woodwork.statistics_utils._get_histogram_values import _get_histogram_values @@ -110,24 +108,6 @@ def _get_describe_dict( results = {} - if _is_dask_dataframe(dataframe): - df = dataframe.compute() - elif _is_spark_dataframe(dataframe): - df = dataframe.to_pandas() - - # Any LatLong columns will be using lists, which we must convert - # back to tuples so we can calculate the mode, which requires hashable values - latlong_columns = [ - col_name - for col_name, col in dataframe.ww.columns.items() - if type(col.logical_type) == LatLong - ] - df[latlong_columns] = df[latlong_columns].applymap( - lambda latlong: tuple(latlong) if latlong else latlong, - ) - else: - df = dataframe - # Setup for progress callback and make initial call # Assume 1 unit for general preprocessing, plus main loop over column total_loops = 1 + len(cols_to_include) @@ -141,7 +121,7 @@ def _get_describe_dict( values = {} logical_type = column.logical_type semantic_tags = column.semantic_tags - series = df[column_name] + series = dataframe[column_name] agg_stats_to_calculate = { "category": ["count", "nunique"], @@ -165,9 +145,6 @@ def _get_describe_dict( values = series.agg(agg_stats).to_dict() mode = _get_mode(series) - # The format of the mode should match its format in the DataFrame - if _is_spark_dataframe(dataframe) and series.name in latlong_columns: - mode = list(mode) if column.is_latlong: nan_count = series.apply(_is_latlong_nan).sum() diff --git a/woodwork/statistics_utils/_get_value_counts.py b/woodwork/statistics_utils/_get_value_counts.py index d4ee5d794..24248846e 100644 --- a/woodwork/statistics_utils/_get_value_counts.py +++ b/woodwork/statistics_utils/_get_value_counts.py @@ -1,8 +1,3 @@ -import numpy as np - -from woodwork.accessor_utils import _is_dask_dataframe, _is_spark_dataframe - - def _get_value_counts(dataframe, ascending=False, top_n=10, dropna=False): """Returns a list of dictionaries with counts for the most frequent values in each column (only for columns with `category` as a standard tag). 
@@ -28,20 +23,9 @@ def _get_value_counts(dataframe, ascending=False, top_n=10, dropna=False): col for col, column in dataframe.ww.columns.items() if column.is_categorical ] data = dataframe[valid_cols] - is_ks = False - if _is_dask_dataframe(data): - data = data.compute() - if _is_spark_dataframe(data): - data = data.to_pandas() - is_ks = True for col in valid_cols: - if dropna and is_ks: - # Spark categorical columns will have missing values replaced with the string 'None' - # Replace them with np.nan so dropna work - datacol = data[col].replace(to_replace="None", value=np.nan) - else: - datacol = data[col] + datacol = data[col] frequencies = datacol.value_counts(ascending=ascending, dropna=dropna) df = frequencies[:top_n].reset_index() df.columns = ["value", "count"] diff --git a/woodwork/table_accessor.py b/woodwork/table_accessor.py index 88b8991e8..12ee2598a 100644 --- a/woodwork/table_accessor.py +++ b/woodwork/table_accessor.py @@ -7,9 +7,7 @@ from woodwork.accessor_utils import ( _check_table_schema, - _is_dask_dataframe, _is_dataframe, - _is_spark_dataframe, get_invalid_schema_message, init_series, ) @@ -35,10 +33,7 @@ from woodwork.table_schema import TableSchema from woodwork.type_sys.utils import _is_numeric_series, col_is_datetime from woodwork.typing import AnyDataFrame, ColumnName, UseStandardTagsDict -from woodwork.utils import _get_column_logical_type, _parse_logical_type, import_or_none - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") +from woodwork.utils import _get_column_logical_type, _parse_logical_type class WoodworkTableAccessor: @@ -282,11 +277,7 @@ def __eq__(self, other, deep=True): return False # Only check pandas DataFrames for equality - if ( - deep - and isinstance(self._dataframe, pd.DataFrame) - and isinstance(other.ww._dataframe, pd.DataFrame) - ): + if deep: return self._dataframe.equals(other.ww._dataframe) return True @@ -324,8 +315,7 @@ def __getitem__(self, key): return series def __setitem__(self, col_name, column): - series = tuple(pkg.Series for pkg in (pd, dd, ps) if pkg) - if not isinstance(column, series): + if not isinstance(column, pd.Series): raise ValueError("New column must be of Series type") if column.ww.schema is not None and "index" in column.ww.semantic_tags: @@ -707,8 +697,6 @@ def to_disk( ) def _sort_columns(self, already_sorted): - if _is_dask_dataframe(self._dataframe) or _is_spark_dataframe(self._dataframe): - already_sorted = True # Skip sorting for Dask and Spark input if not already_sorted: sort_cols = [self._schema.time_index, self._schema.index] if self._schema.index is None: @@ -718,9 +706,9 @@ def _sort_columns(self, already_sorted): def _set_underlying_index(self): """Sets the index of the underlying DataFrame to match the index column as specified by the TableSchema. Does not change the underlying index if no Woodwork index is - specified. Only sets underlying index for pandas DataFrames. + specified. """ - if isinstance(self._dataframe, pd.DataFrame) and self._schema.index is not None: + if self._schema.index is not None: self._dataframe.set_index(self._schema.index, drop=False, inplace=True) # Drop index name to not overlap with the original column self._dataframe.index.name = None @@ -801,12 +789,6 @@ def _get_subset_df_with_schema(self, cols_to_include, inplace=False): """Creates a new DataFrame from a list of column names with Woodwork initialized, retaining all typing information and maintaining the DataFrame's column order. 
""" - if inplace: - if _is_dask_dataframe(self._dataframe): - raise ValueError("Drop inplace not supported for Dask") - if _is_spark_dataframe(self._dataframe): - raise ValueError("Drop inplace not supported for Spark") - assert all([col_name in self._schema.columns for col_name in cols_to_include]) new_schema = self._schema.get_subset_schema(cols_to_include) @@ -888,10 +870,6 @@ def rename(self, columns, inplace=False): new_schema = self._schema.rename(columns) if inplace: - if _is_dask_dataframe(self._dataframe): - raise ValueError("Rename inplace not supported for Dask") - if _is_spark_dataframe(self._dataframe): - raise ValueError("Rename inplace not supported for Spark") self._dataframe.rename(columns=columns, inplace=True) self.init_with_full_schema(schema=new_schema) return @@ -1614,7 +1592,6 @@ def value_counts(self, ascending=False, top_n=10, dropna=False): def infer_temporal_frequencies(self, temporal_columns=None, debug=False): """Infers the observation frequency (daily, biweekly, yearly, etc) of each temporal column in the DataFrame. Temporal columns are ones with the logical type Datetime or Timedelta. - Not supported for Dask and Spark DataFrames. Args: temporal_columns (list[str], optional): Columns for which frequencies should be inferred. Must be columns @@ -1682,11 +1659,6 @@ def validate_logical_types(self, return_invalid_values=False): if return_invalid_values and invalid_values: concat = pd.concat - if _is_dask_dataframe(self._dataframe): - concat = dd.concat - if _is_spark_dataframe(self._dataframe): - concat = ps.concat - return concat(invalid_values, axis=1) @@ -1740,9 +1712,8 @@ def _check_index(dataframe, index): raise ColumnNotPresentError( f"Specified index column `{index}` not found in dataframe", ) - if index is not None and isinstance(dataframe, pd.DataFrame): + if index is not None: # User specifies a dataframe index that is not unique or contains null values - # Does not check Dask dataframes to avoid pulling data into memory and Dask does not support is_unique if not dataframe[index].is_unique: raise IndexError("Index column must be unique") @@ -1861,12 +1832,8 @@ def _infer_missing_logical_types( null_invalid_values=null_invalid_values, ) if updated_series is not series: - # NotImplementedError thrown by dask when attempting to re-initialize - # data after being assigned a numeric column name - try: - dataframe[name] = updated_series - except NotImplementedError: - pass + dataframe[name] = updated_series + return parsed_logical_types @@ -1897,21 +1864,3 @@ def _merge_use_standard_tags( @pd.api.extensions.register_dataframe_accessor("ww") class PandasTableAccessor(WoodworkTableAccessor): pass - - -if dd: - - @dd.extensions.register_dataframe_accessor("ww") - class DaskTableAccessor(WoodworkTableAccessor): - pass - - -if ps: - from pyspark.pandas.extensions import register_dataframe_accessor - - @register_dataframe_accessor("ww") - class SparkTableAccessor(WoodworkTableAccessor): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if not ps.get_option("compute.ops_on_diff_frames"): - ps.set_option("compute.ops_on_diff_frames", True) diff --git a/woodwork/tests/accessor/test_column_accessor.py b/woodwork/tests/accessor/test_column_accessor.py index 8770a2714..4bcd4ce59 100644 --- a/woodwork/tests/accessor/test_column_accessor.py +++ b/woodwork/tests/accessor/test_column_accessor.py @@ -6,10 +6,7 @@ import pytest from woodwork.accessor_utils import ( - _is_dask_dataframe, - _is_dask_series, _is_dataframe, - _is_spark_series, 
init_series, ) from woodwork.column_accessor import WoodworkColumnAccessor @@ -34,15 +31,9 @@ SubRegionCode, ) from woodwork.tests.testing_utils import ( - concat_dataframe_or_series, is_property, is_public_method, - to_pandas, ) -from woodwork.utils import import_or_none - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") def test_accessor_init(sample_series): @@ -96,12 +87,8 @@ def test_accessor_init_with_schema_errors(sample_series): with pytest.raises(TypeError, match=error): head_series.ww.init(schema=int) - if _is_spark_series(sample_series): - ltype_dtype = "string" - new_dtype = " None: +def test_ordinal_transform_validates(ordinal_transform_series) -> None: typ = Ordinal(order=None) with pytest.raises(TypeError, match=r"order values defined"): - typ.transform(ordinal_transform_series_pandas) + typ.transform(ordinal_transform_series) -def test_ordinal_transform_pandas(ordinal_transform_series_pandas) -> None: +def test_ordinal_transform(ordinal_transform_series) -> None: order = [2, 1, 3] typ = Ordinal(order=order) - ser_ = typ.transform(ordinal_transform_series_pandas) + ser_ = typ.transform(ordinal_transform_series) assert ser_.dtype == "category" pd.testing.assert_index_equal(ser_.cat.categories, pd.Index(order, dtype="int64")) -def test_ordinal_transform_dask(ordinal_transform_series_dask) -> None: - order = [2, 1, 3] - typ = Ordinal(order=order) - ser_ = typ.transform(ordinal_transform_series_dask).compute() - - assert ser_.dtype == "category" - pd.testing.assert_index_equal(ser_.cat.categories, pd.Index(order, dtype="int64")) - - -def test_ordinal_transform_spark(ordinal_transform_series_spark) -> None: - order = [2, 1, 3] - typ = Ordinal(order=order) - ser_ = typ.transform(ordinal_transform_series_spark) - - assert ser_.dtype == pd.StringDtype() - - def test_get_valid_dtype(sample_series): valid_dtype = Categorical._get_valid_dtype(type(sample_series)) - if _is_spark_series(sample_series): - assert valid_dtype == "string" - else: - assert valid_dtype == "category" + assert valid_dtype == "category" valid_dtype = Boolean._get_valid_dtype(type(sample_series)) assert valid_dtype == "bool" def test_latlong_transform(latlong_df): - df_type = str(type(latlong_df)) - dask = "dask" in df_type - spark = "spark" in df_type nan = float("nan") expected_data = { @@ -160,11 +124,6 @@ def test_latlong_transform(latlong_df): series = latlong_df[column] actual = latlong.transform(series) - if dask: - actual = actual.compute() - elif spark: - actual = actual.to_pandas() - expected = pd.Series(expected_data[column], name=column) pd.testing.assert_series_equal(actual, expected) @@ -174,11 +133,6 @@ def test_latlong_transform_empty_series(empty_latlong_df): series = empty_latlong_df["latlong"] actual = latlong.transform(series) - if _is_dask_series(actual): - actual = actual.compute() - elif _is_spark_series(actual): - actual = actual.to_pandas() - assert actual.empty assert actual.name == "latlong" assert actual.dtype == latlong.primary_dtype @@ -239,16 +193,7 @@ def test_datetime_coerce_user_format(): assert datetime.datetime_format == "%m/%d/%Y" -def test_ordinal_transform(sample_series): - series_type = str(type(sample_series)) - dask = "dask" in series_type - spark = "spark" in series_type - - if dask or spark: - pytest.xfail( - "Fails with Dask and Spark - ordinal data validation not supported", - ) - +def test_ordinal_transform_missing_vals(sample_series): ordinal_incomplete_order = Ordinal(order=["a", "b"]) error_msg = re.escape( "Ordinal column sample_series 
contains values that are not " @@ -260,15 +205,6 @@ def test_ordinal_transform(sample_series): def test_ordinal_validate(sample_series): - series_type = str(type(sample_series)) - dask = "dask" in series_type - spark = "spark" in series_type - - if dask or spark: - pytest.xfail( - "Fails with Dask and Spark - ordinal data validation not supported", - ) - ordinal_incomplete_order = Ordinal(order=["a", "b"]) error_msg = re.escape( "Ordinal column sample_series contains values that are not " @@ -292,12 +228,9 @@ def test_email_address_validate(sample_df): series = sample_df["email"].astype(dtype) invalid_row = pd.Series({4: "bad_email"}, name="email").astype(dtype) - if _is_spark_series(series): - invalid_row = ps.from_pandas(invalid_row) - assert email_address.validate(series) is None - series = concat_dataframe_or_series(series, invalid_row).astype(dtype) + series = pd.concat([series, invalid_row]).astype(dtype) match = "Series email contains invalid email address values. " match += "The email_inference_regex can be changed in the config if needed." @@ -307,7 +240,7 @@ def test_email_address_validate(sample_df): actual = email_address.validate(series, return_invalid_values=True) expected = pd.Series({4: "bad_email"}, name="email").astype(dtype) - assert to_pandas(actual).equals(expected) + assert actual.equals(expected) def test_url_validate(sample_df): @@ -315,12 +248,10 @@ def test_url_validate(sample_df): dtype = logical_type.primary_dtype series = sample_df["url"].astype(dtype) invalid_row = pd.Series({4: "bad_url"}, name="url").astype(dtype) - if _is_spark_series(series): - invalid_row = ps.from_pandas(invalid_row) assert logical_type.validate(series) is None - series = concat_dataframe_or_series(series, invalid_row).astype(dtype) + series = pd.concat([series, invalid_row]).astype(dtype) match = "Series url contains invalid url values. " match += "The url_inference_regex can be changed in the config if needed." @@ -330,7 +261,7 @@ def test_url_validate(sample_df): actual = logical_type.validate(series, return_invalid_values=True) expected = pd.Series({4: "bad_url"}, name="url").astype(dtype) - assert to_pandas(actual).equals(expected) + assert actual.equals(expected) @pytest.mark.parametrize( @@ -349,17 +280,14 @@ def test_age_validate(sample_df, logical_type): assert logical_type.validate(series, return_invalid_values=False) is None invalid_row = pd.Series({4: -3}, name="age", dtype=dtype) - if _is_spark_series(series): - invalid_row = ps.from_pandas(invalid_row) - - series = concat_dataframe_or_series(series, invalid_row).astype(dtype) + series = pd.concat([series, invalid_row]).astype(dtype) match = "Series age contains negative values." with pytest.raises(TypeValidationError, match=match): logical_type.validate(series, return_invalid_values=False) actual = logical_type.validate(series, return_invalid_values=True) - assert to_pandas(actual).equals(to_pandas(invalid_row)) + assert actual.equals(invalid_row) def test_phone_number_validate(sample_df): @@ -368,12 +296,9 @@ def test_phone_number_validate(sample_df): series = sample_df["phone_number"].astype(dtype) invalid_row = pd.Series({4: "bad_phone"}, name="phone_number").astype(dtype) - if _is_spark_series(series): - invalid_row = ps.from_pandas(invalid_row) - assert phone_number.validate(series) is None - series = concat_dataframe_or_series(series, invalid_row).astype(dtype) + series = pd.concat([series, invalid_row]).astype(dtype) match = "Series phone_number contains invalid phone number values. 
" match += "The phone_inference_regex can be changed in the config if needed." @@ -383,7 +308,7 @@ def test_phone_number_validate(sample_df): actual = phone_number.validate(series, return_invalid_values=True) expected = pd.Series({4: "bad_phone"}, name="phone_number").astype(dtype) - assert to_pandas(actual).equals(expected) + assert actual.equals(expected) def test_phone_number_validate_complex(sample_df_phone_numbers): @@ -396,14 +321,14 @@ def test_phone_number_validate_complex(sample_df_phone_numbers): name="phone_number", ).astype(dtype) - series = concat_dataframe_or_series(series, invalid_row).astype(dtype) + series = pd.concat([series, invalid_row]).astype(dtype) actual = phone_number.validate(series, return_invalid_values=True) expected = pd.Series( {17: "252 9384", 18: "+1 194 129 1991", 19: "+01 236 248 8482"}, name="phone_number", ).astype(dtype) - assert to_pandas(actual).equals(expected) + assert actual.equals(expected) def test_postal_code_validate(sample_df_postal_code): @@ -417,7 +342,7 @@ def test_postal_code_validate(sample_df_postal_code): ], ) - series = concat_dataframe_or_series(series, invalid_types) + series = pd.concat([series, invalid_types]) series.name = "postal_code" match = "Series postal_code contains invalid postal code values. " @@ -447,7 +372,7 @@ def test_postal_code_validate_complex(sample_df_postal_code): actual = pc.validate(series, return_invalid_values=True) assert not len(actual) - series = concat_dataframe_or_series(series, invalid_types) + series = pd.concat([series, invalid_types]) actual = pc.validate(series, return_invalid_values=True) pd.testing.assert_series_equal(actual, invalid_types) @@ -455,7 +380,7 @@ def test_postal_code_validate_complex(sample_df_postal_code): def test_postal_code_validate_numeric(postal_code_numeric_series): series = init_series(postal_code_numeric_series, logical_type=PostalCode()) - actual = to_pandas(series.ww.validate_logical_type(return_invalid_values=True)) + actual = series.ww.validate_logical_type(return_invalid_values=True) expected = pd.Series({5: "1234567890"}) pd.testing.assert_series_equal( @@ -466,11 +391,8 @@ def test_postal_code_validate_numeric(postal_code_numeric_series): ) -def test_postal_code_error(postal_code_numeric_series_pandas): - series = concat_dataframe_or_series( - postal_code_numeric_series_pandas, - pd.Series([1234.5]), - ) +def test_postal_code_error(postal_code_numeric_series): + series = pd.concat([postal_code_numeric_series, pd.Series([1234.5])]) match = ( "Error converting datatype for None from type float64 to type string. " "Please confirm the underlying data is consistent with logical type PostalCode." 
@@ -874,52 +796,6 @@ def test_datetime_pivot_point_should_not_apply(delim): pd.testing.assert_frame_equal(df, df_expected) -@pytest.mark.parametrize("type", ["pyspark", "dask"]) -def test_pyspark_dask_series(type): - dates = [ - "01/01/24", - "01/01/28", - "01/01/30", - "01/01/32", - "01/01/36", - "01/01/40", - "01/01/72", - None, - "01/01/88", - ] - datetime_str = "%m/%d/%y" - expected_values = [ - "2024-01-01", - "2028-01-01", - "2030-01-01", - "1932-01-01", - "1936-01-01", - "1940-01-01", - "1972-01-01", - None, - "1988-01-01", - ] - expected_values = get_expected_dates(expected_values) - df = pd.DataFrame({"dates": dates}) - if type == "pyspark": - ps = pytest.importorskip( - "pyspark.pandas", - reason="Pyspark pandas not installed, skipping", - ) - df = ps.from_pandas(df) - else: - dd = pytest.importorskip( - "dask.dataframe", - reason="Dask not installed, skipping", - ) - df = dd.from_pandas(df, npartitions=2) - df.ww.init(logical_types={"dates": Datetime(datetime_format=datetime_str)}) - df_expected = pd.DataFrame({"dates": expected_values}, dtype="datetime64[ns]") - df = to_pandas(df) - df.sort_index(inplace=True) - pd.testing.assert_frame_equal(df, df_expected) - - def test_datetime_pivot_point_no_format_provided(): dates = [ "01/01/24", @@ -995,8 +871,7 @@ def test_datetime_formats_two_digit_years_ambiguous(): pd.testing.assert_series_equal(series, series_expected) -@pytest.mark.parametrize("df_type", ["pandas", "dask", "spark"]) -def test_boolean_other_values(df_type): +def test_boolean_other_values(): df = pd.DataFrame( { "bool2": ["t", "f", "t", "f", "t", "t"], @@ -1007,18 +882,6 @@ def test_boolean_other_values(df_type): "bool9": ["N", "N", "n", "y", "Y", "y"], }, ) - if df_type == "spark": - ps = pytest.importorskip( - "pyspark.pandas", - reason="Pyspark pandas not installed, skipping", - ) - df = ps.from_pandas(df) - elif df_type == "dask": - dd = pytest.importorskip( - "dask.dataframe", - reason="Dask not installed, skipping", - ) - df = dd.from_pandas(df, npartitions=1) df.ww.init() assert all([str(dtype) == "Boolean" for dtype in df.ww.logical_types.values()]) @@ -1074,9 +937,8 @@ def test_boolean_with_null_error(series, null, cast_null): ] -@pytest.mark.parametrize("df_type", ["pandas", "dask", "spark"]) @pytest.mark.parametrize("null", [None, pd.NA, np.nan]) -def test_boolean_nullable_other_values_dont_cast(null, df_type): +def test_boolean_nullable_other_values_dont_cast(null): df = pd.DataFrame( { "bool1": ["N", "N", "n", null, "Y", "y"], @@ -1087,18 +949,6 @@ def test_boolean_nullable_other_values_dont_cast(null, df_type): "bool7": ["YES", "NO", "YES", "yes", null, "no"], }, ) - if df_type == "spark": - ps = pytest.importorskip( - "pyspark.pandas", - reason="Pyspark pandas not installed, skipping", - ) - df = ps.from_pandas(df) - elif df_type == "dask": - dd = pytest.importorskip( - "dask.dataframe", - reason="Dask not installed, skipping", - ) - df = dd.from_pandas(df, npartitions=1) df.ww.init() assert all( [str(dtype) == "BooleanNullable" for dtype in df.ww.logical_types.values()], @@ -1205,12 +1055,6 @@ def test_object_dtype_inference(comprehensive_df): df_copy_objects.ww.init( logical_types={col: Unknown for col in df_copy_objects.columns}, ) - if _is_dask_dataframe(df_copy): - df_copy = df_copy.ww.compute() - df_copy_objects = df_copy_objects.ww.compute() - elif _is_spark_dataframe(df_copy): - df_copy = df_copy.ww.to_pandas() - df_copy_objects = df_copy_objects.ww.to_pandas() # Confirm proper Woodwork inference for pandas-inferred object columns assert { col: 
str(ltype) for col, ltype in df_copy.ww.logical_types.items() diff --git a/woodwork/tests/requirement_files/latest_dask_dependencies.txt b/woodwork/tests/requirement_files/latest_dask_dependencies.txt deleted file mode 100644 index 721f36214..000000000 --- a/woodwork/tests/requirement_files/latest_dask_dependencies.txt +++ /dev/null @@ -1,7 +0,0 @@ -click==8.1.7 -dask==2024.5.0 -dask-expr==1.1.0 -numpy==1.26.4 -pandas==2.2.2 -pyarrow==16.0.0 -scikit-learn==1.4.2 diff --git a/woodwork/tests/requirement_files/latest_spark_dependencies.txt b/woodwork/tests/requirement_files/latest_spark_dependencies.txt deleted file mode 100644 index d1cc1f9c5..000000000 --- a/woodwork/tests/requirement_files/latest_spark_dependencies.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy==1.26.4 -pandas==2.2.2 -pyarrow==16.0.0 -pyspark==3.5.1 -scikit-learn==1.4.2 diff --git a/woodwork/tests/requirement_files/minimum_dask_requirements.txt b/woodwork/tests/requirement_files/minimum_dask_requirements.txt deleted file mode 100644 index b116008ba..000000000 --- a/woodwork/tests/requirement_files/minimum_dask_requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -dask[dataframe]==2024.4.1 -importlib-resources==5.10.0 -numpy==1.25.0 -pandas==2.0.0 -python-dateutil==2.8.2 -scikit-learn==1.1.0 -scipy==1.10.0 diff --git a/woodwork/tests/requirement_files/minimum_spark_requirements.txt b/woodwork/tests/requirement_files/minimum_spark_requirements.txt deleted file mode 100644 index f1569a232..000000000 --- a/woodwork/tests/requirement_files/minimum_spark_requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -importlib-resources==5.10.0 -numpy==1.25.0 -pandas==2.0.0 -pyarrow==14.0.1 -pyspark==3.5.0 -python-dateutil==2.8.2 -scikit-learn==1.1.0 -scipy==1.10.0 diff --git a/woodwork/tests/testing_utils/__init__.py b/woodwork/tests/testing_utils/__init__.py index 83190091d..1c9a4abb7 100644 --- a/woodwork/tests/testing_utils/__init__.py +++ b/woodwork/tests/testing_utils/__init__.py @@ -4,27 +4,13 @@ from woodwork.tests.testing_utils.table_utils import ( _check_close, check_empty_box_plot_dict, - concat_dataframe_or_series, dep_between_cols, is_property, is_public_method, - to_pandas, validate_subset_schema, ) -def pd_to_dask(series): - dask = pytest.importorskip("dask", reason="Dask not installed, skipping") - dask.config.set({"dataframe.convert-string": False}) - dd = dask.dataframe - return dd.from_pandas(series, npartitions=1) - - -def pd_to_spark(series): - ps = pytest.importorskip("pyspark.pandas", reason="Spark not installed, skipping") - return ps.from_pandas(convert_tuples_to_lists(series)) - - def convert_tuples_to_lists(series): def apply_func(value): if type(value) is tuple: diff --git a/woodwork/tests/testing_utils/table_utils.py b/woodwork/tests/testing_utils/table_utils.py index 140b11e11..c6ecc26f8 100644 --- a/woodwork/tests/testing_utils/table_utils.py +++ b/woodwork/tests/testing_utils/table_utils.py @@ -1,12 +1,6 @@ import numpy as np import pandas as pd -from woodwork import accessor_utils -from woodwork.utils import import_or_none - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - def validate_subset_schema(subset_schema, schema): assert subset_schema.name == schema.name @@ -28,40 +22,6 @@ def dep_between_cols(col1, col2, dep_name, df): return dep_series.iloc[0] -def to_pandas(df, index=None, sort_index=False, str_to_object=False): - """Testing util to convert dataframes to pandas. If a pandas dataframe is passed in, just returns the dataframe. 
- - Args: - index sets the index, default = None - sort_index (bool) sort, default = False - str_to_object (bool) convert string to object for comparison, default = False - - - Returns: - Pandas DataFrame - """ - if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)): - return df - - if dd and isinstance(df, (dd.DataFrame, dd.Series, dd.Index)): - pd_df = df.compute() - - if ps and isinstance(df, (ps.DataFrame, ps.Series, ps.Index)): - pd_df = df.to_pandas() - - if index: - pd_df = pd_df.set_index(index, drop=False) - if sort_index: - pd_df = pd_df.sort_index() - - if str_to_object: - return pd_df.astype( - {col: "object" for col in pd_df.select_dtypes("string").columns}, - ) - - return pd_df - - def is_public_method(class_to_check, name): """Determine if the specified name is a public method on a class""" if hasattr(class_to_check, name) and name[0] != "_": @@ -108,30 +68,3 @@ def _check_close(actual, expected): assert pd.isnull(actual) else: np.testing.assert_allclose(actual, expected, atol=1e-3) - - -def concat_dataframe_or_series(base, to_add): - """Selects and calls the appropriate concat method based on the type of the base and to_add Series/DataFrame - - Args: - base: base Series/DataFrame - to_add: Series/DataFrame to be concatenated - - Returns: - Series/DataFrame: result of concatenation - """ - dd = import_or_none("dask.dataframe") - ps = import_or_none("pyspark.pandas") - - if isinstance(base, (pd.Series, pd.DataFrame)): - concatenated_obj = pd.concat([base, to_add]) - elif accessor_utils._is_dask_dataframe( - base, - ) or accessor_utils._is_dask_series(base): - concatenated_obj = dd.concat([base, to_add]) - elif accessor_utils._is_spark_dataframe( - base, - ) or accessor_utils._is_spark_series(base): - concatenated_obj = ps.concat([base, to_add]) - - return concatenated_obj diff --git a/woodwork/tests/type_system/conftest.py b/woodwork/tests/type_system/conftest.py index f7db1e55c..8b7066d34 100644 --- a/woodwork/tests/type_system/conftest.py +++ b/woodwork/tests/type_system/conftest.py @@ -5,7 +5,6 @@ import pytest from woodwork.logical_types import Categorical, CountryCode, Double, Integer, Unknown -from woodwork.tests.testing_utils import pd_to_dask, pd_to_spark from woodwork.type_sys.inference_functions import ( categorical_func, double_func, @@ -16,7 +15,7 @@ # Integer Inference Fixtures @pytest.fixture -def pandas_integers(): +def integers(): return [ pd.Series(4 * [-1, 2, 1, 7]), pd.Series(4 * [-1, 0, 5, 3]), @@ -24,24 +23,9 @@ def pandas_integers(): ] -@pytest.fixture -def dask_integers(pandas_integers): - return [pd_to_dask(series) for series in pandas_integers] - - -@pytest.fixture -def spark_integers(pandas_integers): - return [pd_to_spark(series) for series in pandas_integers] - - -@pytest.fixture(params=["pandas_integers", "dask_integers", "spark_integers"]) -def integers(request): - return request.getfixturevalue(request.param) - - # Double Inference Fixtures @pytest.fixture -def pandas_doubles(): +def doubles(): return [ pd.Series(4 * [-1, 2.5, 1, 7]), pd.Series(4 * [1.5, np.nan, 1, 3]), @@ -50,24 +34,9 @@ def pandas_doubles(): ] -@pytest.fixture -def dask_doubles(pandas_doubles): - return [pd_to_dask(series) for series in pandas_doubles] - - -@pytest.fixture -def spark_doubles(pandas_doubles): - return [pd_to_spark(series) for series in pandas_doubles] - - -@pytest.fixture(params=["pandas_doubles", "dask_doubles", "spark_doubles"]) -def doubles(request): - return request.getfixturevalue(request.param) - - # Boolean Inference Fixtures @pytest.fixture -def 
pandas_bools(): +def bools(): return [ pd.Series([True, False, True, True]), pd.Series([True, np.nan, True, True]), @@ -82,48 +51,18 @@ def pandas_bools(): ] -@pytest.fixture -def dask_bools(pandas_bools): - return [pd_to_dask(series) for series in pandas_bools] - - -@pytest.fixture -def spark_bools(pandas_bools): - return [pd_to_spark(series) for series in pandas_bools] - - -@pytest.fixture(params=["pandas_bools", "dask_bools", "spark_bools"]) -def bools(request): - return request.getfixturevalue(request.param) - - # Datetime Inference Fixtures @pytest.fixture -def pandas_datetimes(): +def datetimes(): return [ pd.Series(["2000-3-11", "2000-3-12", "2000-03-13", "2000-03-14"]), pd.Series(["2000-3-11", np.nan, "2000-03-13", "2000-03-14"]), ] -@pytest.fixture -def dask_datetimes(pandas_datetimes): - return [pd_to_dask(series) for series in pandas_datetimes] - - -@pytest.fixture -def spark_datetimes(pandas_datetimes): - return [pd_to_spark(series) for series in pandas_datetimes] - - -@pytest.fixture(params=["pandas_datetimes", "dask_datetimes", "spark_datetimes"]) -def datetimes(request): - return request.getfixturevalue(request.param) - - # Email Inference Fixtures @pytest.fixture -def pandas_emails(): +def emails(): return [ pd.Series( ["fl@alteryx.com", "good@email.com", "boaty@mcboatface.com", "foo@bar.com"], @@ -135,24 +74,9 @@ def pandas_emails(): ] -@pytest.fixture -def dask_emails(pandas_emails): - return [pd_to_dask(series) for series in pandas_emails] - - -@pytest.fixture -def spark_emails(pandas_emails): - return [pd_to_spark(series) for series in pandas_emails] - - -@pytest.fixture(params=["pandas_emails", "dask_emails", "spark_emails"]) -def emails(request): - return request.getfixturevalue(request.param) - - # Email Inference Fixtures @pytest.fixture -def bad_pandas_emails(): +def bad_emails(): return [ pd.Series(["fl@alteryx.com", "not_an_email", "good@email.com", "foo@bar.com"]), pd.Series(["fl@alteryx.com", "b☃d@email.com", "good@email.com", np.nan]), @@ -165,24 +89,9 @@ def bad_pandas_emails(): ] -@pytest.fixture -def bad_dask_emails(bad_pandas_emails): - return [pd_to_dask(series) for series in bad_pandas_emails] - - -@pytest.fixture -def bad_spark_emails(bad_pandas_emails): - return [pd_to_spark(series) for series in bad_pandas_emails] - - -@pytest.fixture(params=["bad_pandas_emails", "bad_dask_emails", "bad_spark_emails"]) -def bad_emails(request): - return request.getfixturevalue(request.param) - - # Categorical Inference Fixtures @pytest.fixture -def pandas_categories(): +def categories(): return [ pd.Series(10 * ["a", "b", "a", "b"]), pd.Series(10 * ["1", "2", "1", "2"]), @@ -192,22 +101,7 @@ def pandas_categories(): @pytest.fixture -def dask_categories(pandas_categories): - return [pd_to_dask(series) for series in pandas_categories] - - -@pytest.fixture -def spark_categories(pandas_categories): - return [pd_to_spark(series) for series in pandas_categories] - - -@pytest.fixture(params=["pandas_categories", "dask_categories", "spark_categories"]) -def categories(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def pandas_categories_dtype(): +def categories_dtype(): return pd.DataFrame( { "cat": pd.Series(["a", "b", "c", "d"], dtype="category"), @@ -216,40 +110,18 @@ def pandas_categories_dtype(): ) -@pytest.fixture -def dask_categories_dtype(pandas_categories_dtype): - return pd_to_dask(pandas_categories_dtype) - - -@pytest.fixture(params=["pandas_categories_dtype", "dask_categories_dtype"]) -def categories_dtype(request): - # Spark 
doesn't support the "category" dtype. We just leave it out for - # now. - return request.getfixturevalue(request.param) - - # Timedelta Inference Fixtures @pytest.fixture -def pandas_timedeltas(): +def timedeltas(): return [ pd.Series(pd.to_timedelta(range(4), unit="s")), pd.Series([pd.to_timedelta(1, unit="s"), np.nan]), ] -@pytest.fixture -def dask_timedeltas(pandas_timedeltas): - return [pd_to_dask(series) for series in pandas_timedeltas] - - -@pytest.fixture(params=["pandas_timedeltas", "dask_timedeltas"]) -def timedeltas(request): - return request.getfixturevalue(request.param) - - # Natural Language Fixtures @pytest.fixture -def pandas_natural_language(): +def natural_language(): return [ pd.Series( [ @@ -261,30 +133,9 @@ def pandas_natural_language(): ] -@pytest.fixture -def dask_natural_language(pandas_natural_language): - return [pd_to_dask(series) for series in pandas_natural_language] - - -@pytest.fixture -def spark_natural_language(pandas_natural_language): - return [pd_to_spark(series) for series in pandas_natural_language] - - -@pytest.fixture( - params=[ - "pandas_natural_language", - "dask_natural_language", - "spark_natural_language", - ], -) -def natural_language(request): - return request.getfixturevalue(request.param) - - # Postal Inference Fixtures @pytest.fixture -def pandas_postal_codes(): +def postal(): return [ pd.Series(10 * ["77002", "55106"]), pd.Series(10 * ["77002-0000", "55106-0000"]), @@ -292,26 +143,9 @@ def pandas_postal_codes(): ] -@pytest.fixture -def dask_postal_codes(pandas_postal_codes): - return [pd_to_dask(series) for series in pandas_postal_codes] - - -@pytest.fixture -def spark_postal_codes(pandas_postal_codes): - return [pd_to_spark(series) for series in pandas_postal_codes] - - -@pytest.fixture( - params=["pandas_postal_codes", "dask_postal_codes", "spark_postal_codes"], -) -def postal(request): - return request.getfixturevalue(request.param) - - # Unknown Inference Fixtures @pytest.fixture -def pandas_strings(): +def strings(): return [ pd.Series( ["Mr. John Doe", "Doe, Mrs. Jane", "James Brown", "Ms. 
Paige Turner"], @@ -322,24 +156,9 @@ def pandas_strings(): ] -@pytest.fixture -def dask_strings(pandas_strings): - return [pd_to_dask(series) for series in pandas_strings] - - -@pytest.fixture -def spark_strings(pandas_strings): - return [pd_to_spark(series) for series in pandas_strings] - - -@pytest.fixture(params=["pandas_strings", "dask_strings", "spark_strings"]) -def strings(request): - return request.getfixturevalue(request.param) - - # pd.NA Inference Fixtures @pytest.fixture -def pandas_pdnas(): +def pdnas(): return [ pd.Series( [ @@ -357,42 +176,15 @@ def pandas_pdnas(): ] -@pytest.fixture -def dask_pdnas(pandas_pdnas): - return [pd_to_dask(series) for series in pandas_pdnas] - - -@pytest.fixture(params=["pandas_pdnas", "dask_pdnas"]) -def pdnas(request): - return request.getfixturevalue(request.param) - - # Empty Series Inference Fixtures @pytest.fixture -def pandas_empty_series(): +def empty_series(): return pd.Series([], dtype="object") -@pytest.fixture -def dask_empty_series(pandas_empty_series): - return pd_to_dask(pandas_empty_series) - - -@pytest.fixture -def pyspark_empty_series(pandas_empty_series): - return pd_to_spark(pandas_empty_series) - - -@pytest.fixture( - params=["pandas_empty_series", "dask_empty_series", "pyspark_empty_series"], -) -def empty_series(request): - return request.getfixturevalue(request.param) - - # Null Inference Fixtures @pytest.fixture -def pandas_nulls(): +def nulls(): return [ pd.Series([pd.NA, pd.NA, pd.NA, pd.NA]), pd.Series([np.nan, np.nan, np.nan, np.nan]), @@ -403,22 +195,7 @@ def pandas_nulls(): @pytest.fixture -def dask_nulls(pandas_nulls): - return [pd_to_dask(series) for series in pandas_nulls] - - -@pytest.fixture -def spark_nulls(pandas_nulls): - return [pd_to_spark(series) for series in pandas_nulls] - - -@pytest.fixture(params=["pandas_nulls", "dask_nulls", "spark_nulls"]) -def nulls(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture -def pandas_large_df(): +def large_df(): df = pd.DataFrame() df["int_nullable"] = [int(i) for i in range(INFERENCE_SAMPLE_SIZE)] + [np.nan] df["bool_nullable"] = [True, False] * int(INFERENCE_SAMPLE_SIZE // 2) + [pd.NA] @@ -427,21 +204,6 @@ def pandas_large_df(): return df -@pytest.fixture -def dask_large_df(pandas_large_df): - return pd_to_dask(pandas_large_df) - - -@pytest.fixture -def spark_large_df(pandas_large_df): - return pd_to_spark(pandas_large_df) - - -@pytest.fixture(params=["pandas_large_df", "dask_large_df", "spark_large_df"]) -def large_df(request): - return request.getfixturevalue(request.param) - - @pytest.fixture def default_inference_functions(): return { @@ -469,7 +231,7 @@ def type_sys(default_inference_functions, default_relationships): # URL Inference Fixtures @pytest.fixture -def pandas_urls(): +def urls(): return [ pd.Series([f"http://url{i}.com" for i in range(100)]), pd.Series( @@ -484,48 +246,18 @@ def pandas_urls(): ] -@pytest.fixture -def dask_urls(pandas_urls): - return [pd_to_dask(series) for series in pandas_urls] - - -@pytest.fixture -def spark_urls(pandas_urls): - return [pd_to_spark(series) for series in pandas_urls] - - -@pytest.fixture(params=["pandas_urls", "dask_urls", "spark_urls"]) -def urls(request): - return request.getfixturevalue(request.param) - - # Phone Number Inference Fixtures @pytest.fixture -def pandas_phone(): +def phone(): return [ pd.Series([f"200.200.786{i}" for i in range(9)]), pd.Series(["311-311-3156", "(755) 755 7109", "+1(288)-288-7772"] * 3), ] -@pytest.fixture -def dask_phone(pandas_phone): - return 
[pd_to_dask(series) for series in pandas_phone] - - -@pytest.fixture -def spark_phone(pandas_phone): - return [pd_to_spark(series) for series in pandas_phone] - - -@pytest.fixture(params=["pandas_phone", "dask_phone", "spark_phone"]) -def phone(request): - return request.getfixturevalue(request.param) - - # IP Address Inference Fixtures @pytest.fixture -def pandas_ip(): +def ip(): return [ pd.Series( [ @@ -537,18 +269,3 @@ def pandas_ip(): ), pd.Series([f"172.16.254.{i}" for i in range(6)]), ] - - -@pytest.fixture -def dask_ip(pandas_ip): - return [pd_to_dask(series) for series in pandas_ip] - - -@pytest.fixture -def spark_ip(pandas_ip): - return [pd_to_spark(series) for series in pandas_ip] - - -@pytest.fixture(params=["pandas_ip", "dask_ip", "spark_ip"]) -def ip(request): - return request.getfixturevalue(request.param) diff --git a/woodwork/tests/type_system/test_ltype_inference.py b/woodwork/tests/type_system/test_ltype_inference.py index b9ec2de67..59973dc34 100644 --- a/woodwork/tests/type_system/test_ltype_inference.py +++ b/woodwork/tests/type_system/test_ltype_inference.py @@ -1,7 +1,6 @@ from unittest.mock import patch import woodwork as ww -from woodwork.accessor_utils import _is_dask_series, _is_spark_series from woodwork.logical_types import ( URL, Boolean, @@ -20,39 +19,16 @@ Timedelta, Unknown, ) -from woodwork.tests.testing_utils import to_pandas from woodwork.type_sys.type_system import ( DEFAULT_INFERENCE_FUNCTIONS, DEFAULT_RELATIONSHIPS, DEFAULT_TYPE, TypeSystem, ) -from woodwork.utils import import_or_none - -UNSUPPORTED_SPARK_DTYPES = [ - "int32", - "intp", - "uint8", - "uint16", - "uint32", - "uint64", - "uintp", - "float_", - "object", - "category", -] - -ps = import_or_none("pyspark.pandas") - - -def get_spark_dtypes(dtypes): - return [dtype for dtype in dtypes if dtype not in UNSUPPORTED_SPARK_DTYPES] def test_integer_inference(integers): dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"] - if _is_spark_series(integers[0]): - dtypes = get_spark_dtypes(dtypes) for series in integers: for dtype in dtypes: @@ -62,8 +38,6 @@ def test_integer_inference(integers): def test_double_inference(doubles): dtypes = ["float", "float32", "float64", "float_"] - if _is_spark_series(doubles[0]): - dtypes = get_spark_dtypes(dtypes) for series in doubles: for dtype in dtypes: @@ -75,13 +49,11 @@ def test_boolean_inference(bools): dtypes = ["bool", "boolean"] for series in bools: for dtype in dtypes: - if _is_dask_series(series): - series = series.compute() cast_series = series if True in series.dropna().values: cast_series = series.astype(dtype) inferred_type = ww.type_system.infer_logical_type(cast_series) - if to_pandas(cast_series).isnull().any(): + if cast_series.isnull().any(): assert isinstance(inferred_type, BooleanNullable) else: assert isinstance(inferred_type, Boolean) @@ -89,8 +61,6 @@ def test_boolean_inference(bools): def test_datetime_inference(datetimes): dtypes = ["object", "string", "datetime64[ns]"] - if _is_spark_series(datetimes[0]): - dtypes = get_spark_dtypes(dtypes) for series in datetimes: for dtype in dtypes: @@ -100,8 +70,6 @@ def test_datetime_inference(datetimes): def test_email_inference(emails): dtypes = ["object", "string"] - if _is_spark_series(emails[0]): - dtypes = get_spark_dtypes(dtypes) for series in emails: for dtype in dtypes: @@ -111,13 +79,8 @@ def test_email_inference(emails): def test_email_inference_failure(bad_emails): dtypes = ["object", "string"] - if _is_spark_series(bad_emails[0]): - dtypes = get_spark_dtypes(dtypes) 
for series in bad_emails: - if _is_spark_series(series) and isinstance(series.iloc[0], ps.series.Row): - continue - for dtype in dtypes: inferred_type = ww.type_system.infer_logical_type(series.astype(dtype)) assert not isinstance(inferred_type, EmailAddress) @@ -125,8 +88,6 @@ def test_email_inference_failure(bad_emails): def test_categorical_inference(categories): dtypes = ["object", "string", "category"] - if _is_spark_series(categories[0]): - dtypes = get_spark_dtypes(dtypes) for ind, series in enumerate(categories): if ind == len(categories) - 1: dtypes = ["string", "category"] @@ -141,8 +102,6 @@ def test_categorical_inference(categories): def test_postal_inference(postal): dtypes = ["category", "string"] for series in postal: - if _is_spark_series(series): - dtypes = get_spark_dtypes(dtypes) for dtype in dtypes: inferred_dtype = ww.type_system.infer_logical_type(series.astype(dtype)) assert isinstance(inferred_dtype, PostalCode) @@ -150,8 +109,6 @@ def test_postal_inference(postal): def test_natural_language_inference(natural_language): dtypes = ["object", "string"] - if _is_spark_series(natural_language[0]): - dtypes = get_spark_dtypes(dtypes) for series in natural_language: for dtype in dtypes: inferred_type = ww.type_system.infer_logical_type(series.astype(dtype)) @@ -159,9 +116,9 @@ def test_natural_language_inference(natural_language): @patch("woodwork.type_sys.inference_functions.natural_language_func") -def test_nl_inference_called_on_no_other_matches(nl_mock, pandas_natural_language): +def test_nl_inference_called_on_no_other_matches(nl_mock, natural_language): assert isinstance( - ww.type_system.infer_logical_type(pandas_natural_language[0]), + ww.type_system.infer_logical_type(natural_language[0]), NaturalLanguage, ) new_type_sys = TypeSystem( @@ -170,33 +127,33 @@ def test_nl_inference_called_on_no_other_matches(nl_mock, pandas_natural_languag default_type=DEFAULT_TYPE, ) new_type_sys.inference_functions[NaturalLanguage] = nl_mock - _ = new_type_sys.infer_logical_type(pandas_natural_language[0]) + _ = new_type_sys.infer_logical_type(natural_language[0]) assert nl_mock.called @patch("woodwork.type_sys.inference_functions.natural_language_func") -def test_nl_inference_called_with_unknown_type(nl_mock, pandas_strings): - assert isinstance(ww.type_system.infer_logical_type(pandas_strings[0]), Unknown) +def test_nl_inference_called_with_unknown_type(nl_mock, strings): + assert isinstance(ww.type_system.infer_logical_type(strings[0]), Unknown) new_type_sys = TypeSystem( inference_functions=DEFAULT_INFERENCE_FUNCTIONS, relationships=DEFAULT_RELATIONSHIPS, default_type=DEFAULT_TYPE, ) new_type_sys.inference_functions[NaturalLanguage] = nl_mock - _ = new_type_sys.infer_logical_type(pandas_strings[0]) + _ = new_type_sys.infer_logical_type(strings[0]) assert nl_mock.called @patch("woodwork.type_sys.inference_functions.natural_language_func") -def test_nl_inference_not_called_with_other_matches(nl_mock, pandas_integers): - assert isinstance(ww.type_system.infer_logical_type(pandas_integers[0]), Integer) +def test_nl_inference_not_called_with_other_matches(nl_mock, integers): + assert isinstance(ww.type_system.infer_logical_type(integers[0]), Integer) new_type_sys = TypeSystem( inference_functions=DEFAULT_INFERENCE_FUNCTIONS, relationships=DEFAULT_RELATIONSHIPS, default_type=DEFAULT_TYPE, ) new_type_sys.inference_functions[NaturalLanguage] = nl_mock - _ = new_type_sys.infer_logical_type(pandas_integers[0]) + _ = new_type_sys.infer_logical_type(integers[0]) assert not nl_mock.called @@ 
-216,8 +173,6 @@ def test_categorical_inference_based_on_dtype(categories_dtype): def test_categorical_integers_inference(integers): with ww.config.with_options(numeric_categorical_threshold=0.5): dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"] - if _is_spark_series(integers[0]): - dtypes = get_spark_dtypes(dtypes) for series in integers: for dtype in dtypes: inferred_type = ww.type_system.infer_logical_type(series.astype(dtype)) @@ -227,8 +182,6 @@ def test_categorical_integers_inference(integers): def test_categorical_double_inference(doubles): with ww.config.with_options(numeric_categorical_threshold=0.5): dtypes = ["float", "float32", "float64", "float_"] - if _is_spark_series(doubles[0]): - dtypes = get_spark_dtypes(dtypes) for series in doubles: for dtype in dtypes: inferred_type = ww.type_system.infer_logical_type(series.astype(dtype)) @@ -245,8 +198,6 @@ def test_timedelta_inference(timedeltas): def test_unknown_inference(strings): dtypes = ["object", "string"] - if _is_spark_series(strings[0]): - dtypes = get_spark_dtypes(dtypes) for series in strings: for dtype in dtypes: @@ -260,8 +211,6 @@ def test_unknown_inference_all_null(nulls): for ind, series in enumerate(nulls): if ind == len(nulls) - 1: dtypes = ["object", "string", "category"] - if _is_spark_series(nulls[0]): - dtypes = get_spark_dtypes(dtypes) for dtype in dtypes: inferred_type = ww.type_system.infer_logical_type(series.astype(dtype)) inferred_type.transform(series) @@ -296,8 +245,6 @@ class Integer(LogicalType): type_sys.add_type(Integer, inference_function=inference_fn) dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"] - if _is_spark_series(integers[0]): - dtypes = get_spark_dtypes(dtypes) for series in integers: for dtype in dtypes: @@ -321,8 +268,6 @@ def test_inference_randomly_sampled(large_df, type_sys): def test_url_inference(urls): dtypes = ["object", "string"] - if _is_spark_series(urls[0]): - dtypes = get_spark_dtypes(dtypes) for series in urls: for dtype in dtypes: @@ -332,8 +277,6 @@ def test_url_inference(urls): def test_phone_inference(phone): dtypes = ["object", "string"] - if _is_spark_series(phone[0]): - dtypes = get_spark_dtypes(dtypes) for series in phone: for dtype in dtypes: @@ -343,8 +286,6 @@ def test_phone_inference(phone): def test_ip_inference(ip): dtypes = ["object", "string"] - if _is_spark_series(ip[0]): - dtypes = get_spark_dtypes(dtypes) for series in ip: for dtype in dtypes: diff --git a/woodwork/tests/utils/test_accessor_utils.py b/woodwork/tests/utils/test_accessor_utils.py index 28b2a19ba..a0556d497 100644 --- a/woodwork/tests/utils/test_accessor_utils.py +++ b/woodwork/tests/utils/test_accessor_utils.py @@ -3,12 +3,8 @@ import pytest from woodwork.accessor_utils import ( - _is_dask_dataframe, - _is_dask_series, _is_dataframe, _is_series, - _is_spark_dataframe, - _is_spark_series, get_invalid_schema_message, init_series, is_schema_valid, @@ -18,10 +14,7 @@ def test_init_series_valid_conversion_specified_ltype(sample_series): - if _is_spark_series(sample_series): - sample_series = sample_series.astype("str") - else: - sample_series = sample_series.astype("object") + sample_series = sample_series.astype("object") series = init_series(sample_series, logical_type="categorical") assert series is not sample_series @@ -64,10 +57,10 @@ def test_init_series_with_invalid_type(sample_df): init_series(input_) -def test_init_series_with_np_array(sample_series_pandas): - series = init_series(sample_series_pandas.to_numpy()) +def 
test_init_series_with_np_array(sample_series): + series = init_series(sample_series.to_numpy()) series2 = init_series( - sample_series_pandas, + sample_series, ) # Sample series panda contains ['a','b','c','a'] assert series.equals(series2) assert series.ww.logical_type == series2.ww.logical_type @@ -82,10 +75,7 @@ def test_init_series_with_multidimensional_np_array(): def test_init_series_valid_conversion_inferred_ltype(sample_series): - if _is_spark_series(sample_series): - sample_series = sample_series.astype("str") - else: - sample_series = sample_series.astype("object") + sample_series = sample_series.astype("object") series = init_series(sample_series) assert series is not sample_series @@ -109,10 +99,7 @@ def test_init_series_with_latlong(latlong_df): def test_init_series_all_parameters(sample_series): - if _is_spark_series(sample_series): - sample_series = sample_series.astype("str") - else: - sample_series = sample_series.astype("object") + sample_series = sample_series.astype("object") metadata = {"meta_key": "meta_value"} description = "custom description" @@ -137,16 +124,6 @@ def test_init_series_all_parameters(sample_series): def test_init_series_error_on_invalid_conversion(sample_series): - if _is_dask_series(sample_series): - pytest.xfail( - "Dask type conversion with astype does not fail until compute is called", - ) - if _is_spark_series(sample_series): - pytest.xfail( - "Spark allows this conversion, filling values it cannot convert with NaN " - "and converting dtype to float.", - ) - error_message = ( "Error converting datatype for sample_series from type category to type Int64. " "Please confirm the underlying data is consistent with logical type IntegerNullable." @@ -215,28 +192,23 @@ def test_get_invalid_schema_message_dtype_mismatch(sample_df): == "dtype mismatch for column is_registered between DataFrame dtype, Int64, and BooleanNullable dtype, boolean" ) - # Spark backup dtypes make these checks not relevant - if not _is_spark_dataframe(sample_df): - incorrect_str_dtype_df = schema_df.ww.astype( - {"full_name": "object"}, - ) # wont work for spark - incorrect_categorical_dtype_df = schema_df.ww.astype( - {"age": "string"}, - ) # wont work for spark - assert ( - get_invalid_schema_message(incorrect_str_dtype_df, schema) - == "dtype mismatch for column full_name between DataFrame dtype, object, and PersonFullName dtype, string" - ) - assert ( - get_invalid_schema_message(incorrect_categorical_dtype_df, schema) - == "dtype mismatch for column age between DataFrame dtype, string, and Categorical dtype, category" - ) + incorrect_str_dtype_df = schema_df.ww.astype( + {"full_name": "object"}, + ) + incorrect_categorical_dtype_df = schema_df.ww.astype( + {"age": "string"}, + ) + assert ( + get_invalid_schema_message(incorrect_str_dtype_df, schema) + == "dtype mismatch for column full_name between DataFrame dtype, object, and PersonFullName dtype, string" + ) + assert ( + get_invalid_schema_message(incorrect_categorical_dtype_df, schema) + == "dtype mismatch for column age between DataFrame dtype, string, and Categorical dtype, category" + ) def test_get_invalid_schema_message_index_checks(sample_df): - if not isinstance(sample_df, pd.DataFrame): - pytest.xfail("Index validation not performed for Dask or Spark DataFrames") - schema_df = sample_df.copy() schema_df.ww.init( name="test_schema", @@ -293,23 +265,3 @@ def test_is_schema_valid_false(sample_df): missing_col_df = sample_df.drop(columns={"is_registered"}) assert not is_schema_valid(missing_col_df, schema) - - -def 
test_is_dask_dataframe(sample_df_dask):
-    assert _is_dask_dataframe(sample_df_dask)
-    assert not _is_dask_dataframe(pd.DataFrame())
-
-
-def test_is_dask_series(sample_series_dask):
-    assert _is_dask_series(sample_series_dask)
-    assert not _is_dask_series(pd.Series())
-
-
-def test_is_spark_dataframe(sample_df_spark):
-    assert _is_spark_dataframe(sample_df_spark)
-    assert not _is_dask_dataframe(pd.DataFrame())
-
-
-def test_is_spark_series(sample_series_spark):
-    assert _is_spark_series(sample_series_spark)
-    assert not _is_dask_series(pd.Series())
diff --git a/woodwork/tests/utils/test_concat.py b/woodwork/tests/utils/test_concat.py
index b4af4cefd..f5a3a2772 100644
--- a/woodwork/tests/utils/test_concat.py
+++ b/woodwork/tests/utils/test_concat.py
@@ -4,7 +4,6 @@
 import pytest

 import woodwork as ww
-from woodwork.accessor_utils import _is_dask_dataframe, _is_spark_dataframe
 from woodwork.logical_types import (
     BooleanNullable,
     Categorical,
@@ -12,7 +11,6 @@
     Integer,
     IntegerNullable,
 )
-from woodwork.tests.testing_utils import to_pandas
 from woodwork.utils import concat_columns
@@ -54,13 +52,13 @@ def test_concat_cols_ww_dfs(sample_df):
     assert "test_tag" in combined_df.ww.semantic_tags["age"]
     assert combined_df.ww.metadata == {"created_by": "user0"}

     pandas_combined_df = pd.concat(
-        [to_pandas(df1), to_pandas(df2)],
+        [df1, df2],
         axis=1,
         join="outer",
         ignore_index=False,
     )
-    assert to_pandas(combined_df).equals(pandas_combined_df)
+    assert combined_df.equals(pandas_combined_df)


 def test_concat_cols_uninit_dfs(sample_df):
@@ -102,13 +100,13 @@ def test_concat_cols_uninit_dfs(sample_df):
     df1.ww.init()
     df2.ww.init()

     pandas_combined_df = pd.concat(
-        [to_pandas(df1), to_pandas(df2)],
+        [df1, df2],
         axis=1,
         join="outer",
         ignore_index=False,
     )
-    assert to_pandas(combined_df).equals(pandas_combined_df)
+    assert combined_df.equals(pandas_combined_df)


 def test_concat_cols_combo_dfs(sample_df):
@@ -145,13 +143,13 @@ def test_concat_cols_combo_dfs(sample_df):
     df1.ww.init()
     df2.ww.init()

     pandas_combined_df = pd.concat(
-        [to_pandas(df1), to_pandas(df2)],
+        [df1, df2],
         axis=1,
         join="outer",
         ignore_index=False,
     )
-    assert to_pandas(combined_df).equals(pandas_combined_df)
+    assert combined_df.equals(pandas_combined_df)


 def test_concat_cols_with_series(sample_df):
@@ -167,13 +165,13 @@ def test_concat_cols_with_series(sample_df):
     df.ww.init()
     s1.ww.init()
     s2.ww.init()
     pandas_combined_df = pd.concat(
-        [to_pandas(df), to_pandas(s1), to_pandas(s2)],
+        [df, s1, s2],
         axis=1,
         join="outer",
         ignore_index=False,
     )
-    assert to_pandas(combined_df).equals(pandas_combined_df)
+    assert combined_df.equals(pandas_combined_df)


 def test_concat_cols_with_conflicting_ww_indexes(sample_df):
@@ -309,7 +307,7 @@ def test_concat_cols_with_duplicate_ww_indexes(sample_df):
     df2.ww.pop("signup_date")

     # Because underlying index is set, this won't change concat operation
-    pd.testing.assert_index_equal(to_pandas(df1.index), to_pandas(df2.index))
+    pd.testing.assert_index_equal(df1.index, df2.index)

     combined_df = concat_columns([df1, df2])
     assert combined_df.ww.index == "id"
@@ -403,10 +401,6 @@ def test_concat_cols_validate_schema(mock_validate_accessor_params, sample_df):


 def test_concat_cols_mismatched_index_adds_single_nan(sample_df):
-    if _is_dask_dataframe(sample_df):
-        pytest.skip(
-            "Test is currently broken with Dask - can't perform concat operation in `concat_columns` - needs investigation",
-        )
     # If the dtype can handle nans, it won't 
change sample_df.ww.init(logical_types={"id": "IntegerNullable"}) @@ -417,12 +411,12 @@ def test_concat_cols_mismatched_index_adds_single_nan(sample_df): assert len(combined_df) == 4 -def test_concat_cols_mismatched_index_adds_multiple_nans(sample_df_pandas): +def test_concat_cols_mismatched_index_adds_multiple_nans(sample_df): # Only pandas checks for index uniqueness - sample_df_pandas.ww.init(index="id", logical_types={"id": "IntegerNullable"}) + sample_df.ww.init(index="id", logical_types={"id": "IntegerNullable"}) - df1 = sample_df_pandas.ww.loc[[0, 1], ["id", "full_name"]] - df2 = sample_df_pandas.ww.loc[[2, 3], ["signup_date", "email"]] + df1 = sample_df.ww.loc[[0, 1], ["id", "full_name"]] + df2 = sample_df.ww.loc[[2, 3], ["signup_date", "email"]] error = "Index column must be unique" with pytest.raises(IndexError, match=error): @@ -490,7 +484,7 @@ def test_concat_cols_all_series(sample_df): def test_concat_cols_row_order(sample_df): sample_df.ww.init(index="id") - pd.testing.assert_index_equal(to_pandas(sample_df.index), pd.Index([0, 1, 2, 3])) + pd.testing.assert_index_equal(sample_df.index, pd.Index([0, 1, 2, 3])) df1 = sample_df.ww.loc[:, ["id", "full_name"]] df2 = sample_df.ww.loc[[2, 3, 0, 1], ["email", "phone_number"]] @@ -516,14 +510,7 @@ def test_concat_cols_row_order(sample_df): assert sample_df.ww == combined_df.ww - # spark does not preserve index order in the same way - if _is_spark_dataframe(sample_df): - pd.testing.assert_index_equal( - to_pandas(combined_df.index), - pd.Index([0, 1, 2, 3]), - ) - else: - pd.testing.assert_frame_equal(to_pandas(sample_df), to_pandas(combined_df)) + pd.testing.assert_frame_equal(sample_df, combined_df) def test_concat_empty_list(): @@ -563,10 +550,6 @@ def test_concat_shorter_null_int(sample_df, nullable_type): # Purposefully create a dataframe with a non-nullable integer column # that's shorter than the one to concat with - if _is_dask_dataframe(sample_df): - pytest.skip( - "Slicing dataframe with respect to rows is not supported with Dask input", - ) df2 = sample_df.ww[ [ nullable_type, diff --git a/woodwork/tests/utils/test_read_file.py b/woodwork/tests/utils/test_read_file.py index 561c7e586..2e774f89e 100644 --- a/woodwork/tests/utils/test_read_file.py +++ b/woodwork/tests/utils/test_read_file.py @@ -9,9 +9,9 @@ from woodwork.serializers.orc_serializer import save_orc_file -def test_read_file_errors_no_content_type(sample_df_pandas, tmpdir): +def test_read_file_errors_no_content_type(sample_df, tmpdir): filepath = os.path.join(tmpdir, "sample") - sample_df_pandas.to_csv(filepath, index=False) + sample_df.to_csv(filepath, index=False) no_type_error = ( "Content type could not be inferred. Please specify content_type and try again." 
@@ -20,9 +20,9 @@ def test_read_file_errors_no_content_type(sample_df_pandas, tmpdir): ww.read_file(filepath=filepath) -def test_read_file_errors_unsupported(sample_df_pandas, tmpdir): +def test_read_file_errors_unsupported(sample_df, tmpdir): filepath = os.path.join(tmpdir, "sample") - sample_df_pandas.to_feather(filepath) + sample_df.to_feather(filepath) content_type = "notacontenttype" not_supported_error = ( @@ -35,11 +35,11 @@ def test_read_file_errors_unsupported(sample_df_pandas, tmpdir): @patch("woodwork.table_accessor._validate_accessor_params") def test_read_file_validation_control( mock_validate_accessor_params, - sample_df_pandas, + sample_df, tmpdir, ): filepath = os.path.join(tmpdir, "sample.csv") - sample_df_pandas.to_csv(filepath, index=False) + sample_df.to_csv(filepath, index=False) assert not mock_validate_accessor_params.called ww.read_file(filepath=filepath, validate=False) @@ -150,7 +150,7 @@ def test_read_file_validation_control( ], ) def test_read_file( - sample_df_pandas, + sample_df, tmpdir, filepath, exportfn, @@ -160,17 +160,17 @@ def test_read_file( filepath = os.path.join(tmpdir, filepath) func, func_kwargs = exportfn if isinstance(func, str): - getattr(sample_df_pandas, func)(filepath, **func_kwargs) + getattr(sample_df, func)(filepath, **func_kwargs) else: # Call save_orc_file to save orc data since pandas does not have a to_orc method - func(sample_df_pandas, filepath, **func_kwargs) + func(sample_df, filepath, **func_kwargs) df = ww.read_file(filepath=filepath, **kwargs) assert isinstance(df.ww.schema, ww.table_schema.TableSchema) - schema_df = sample_df_pandas.copy() + schema_df = sample_df.copy() if pandas_nullable_fix: # pandas does not read data into nullable types currently from csv or orc, - # so the types in df will be different than the types inferred from sample_df_pandas + # so the types in df will be different than the types inferred from sample_df # which uses the nullable types schema_df = schema_df.astype( { diff --git a/woodwork/tests/utils/test_utils.py b/woodwork/tests/utils/test_utils.py index 4caac8bb1..c2f47b304 100644 --- a/woodwork/tests/utils/test_utils.py +++ b/woodwork/tests/utils/test_utils.py @@ -25,7 +25,6 @@ PostalCode, SubRegionCode, ) -from woodwork.tests.testing_utils import concat_dataframe_or_series from woodwork.type_sys.type_system import DEFAULT_INFERENCE_FUNCTIONS from woodwork.type_sys.utils import ( _get_specified_ltype_params, @@ -55,9 +54,6 @@ import_or_raise, ) -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - def test_camel_to_snake(): test_items = { @@ -300,13 +296,9 @@ def test_reformat_to_latlong_errors(test_input, error_msg): ("", np.nan), ], ) -@pytest.mark.parametrize("is_spark", [True, False]) -def test_reformat_to_latlong(test_input, expected, is_spark): +def test_reformat_to_latlong(test_input, expected): if isinstance(expected, (list, tuple)): - if is_spark: - assert _reformat_to_latlong(test_input, is_spark) == list(expected) - else: - assert _reformat_to_latlong(test_input, is_spark) == expected + assert _reformat_to_latlong(test_input) == expected else: assert _reformat_to_latlong(test_input) is expected @@ -370,30 +362,6 @@ def test_is_valid_latlong_value(test_input, expected): assert _is_valid_latlong_value(test_input) == expected -@pytest.mark.parametrize( - "test_input,expected", - [ - ([1.0, 2.0], True), - ([1.0, np.nan], True), - ([np.nan, 2.0], True), - ([np.nan, np.nan], True), - (np.nan, True), - (None, True), - (pd.NA, False), - (2.0, False), - ([2.0], False), 
- ([None, None], True), - ("None", False), - ((1.0, 2.0), False), - ((pd.NA, pd.NA), False), - (("a", 2.0), False), - ((1.0, 2.0, 3.0), False), - ], -) -def test_is_valid_latlong_value_spark(test_input, expected): - assert _is_valid_latlong_value(test_input, is_spark=True) == expected - - def test_is_valid_latlong_series(): valid_series = pd.Series([(1.0, 2.0), (3.0, 4.0)]) invalid_series = pd.Series([(1.0, 2.0), (3.0, "4.0")]) @@ -556,12 +524,6 @@ def test_infer_datetime_format_all_null(): for pd_series in missing_data: assert _infer_datetime_format(pd_series) is None - if dd: - dd_series = dd.from_pandas(pd_series, npartitions=2) - assert _infer_datetime_format(dd_series) is None - if ps: - ks_series = ps.from_pandas(pd_series) - assert _infer_datetime_format(ks_series) is None def test_is_categorical() -> None: @@ -644,53 +606,3 @@ def test_callback_caller_no_callback(): caller.update(1) assert caller.current_progress == 0 - - -def test_concat_dataframe_or_series_with_series(): - """Tests whether series are correctly concatenated""" - pandas_series = pd.Series([1, 2, 3]) - - assert len(concat_dataframe_or_series(pandas_series, pandas_series)) == 2 * len( - pandas_series, - ) - - if dd: - dask_series = dd.from_pandas(pandas_series, npartitions=1) - assert len(concat_dataframe_or_series(dask_series, dask_series)) == 2 * len( - dask_series, - ) - if ps: - spark_series = ps.Series(data=[1, 2, 3]) - assert len(concat_dataframe_or_series(spark_series, spark_series)) == 2 * len( - spark_series, - ) - - -def test_concat_dataframe_or_series_with_series_with_dataframe(): - """Tests whether dataframes are correctly concatenated""" - d = {"col1": [1, 2], "col2": [3, 4]} - df = pd.DataFrame(data=d) - - assert len(concat_dataframe_or_series(df, df)) == 2 * len( - df, - ) - - if dd: - dask_df = dd.from_pandas(df, npartitions=1) - assert len(concat_dataframe_or_series(dask_df, dask_df)) == 2 * len( - dask_df, - ) - if ps: - spark_df = ps.from_pandas(df) - assert len(concat_dataframe_or_series(spark_df, spark_df)) == 2 * len( - spark_df, - ) - - -def tests_concat_dataframe_or_series_concatenates_in_correct_order(): - """Tests to_add argument is appropriately added to end of base argument""" - base = pd.Series([1, 2, 3]) - to_add = pd.Series([4, 5, 6]) - concatenated_object = concat_dataframe_or_series(base, to_add) - assert concatenated_object.head(3).equals(pd.Series([1, 2, 3])) - assert concatenated_object.tail(3).equals(pd.Series([4, 5, 6])) diff --git a/woodwork/type_sys/type_system.py b/woodwork/type_sys/type_system.py index 59320b0cb..c3cc0c2db 100644 --- a/woodwork/type_sys/type_system.py +++ b/woodwork/type_sys/type_system.py @@ -1,8 +1,5 @@ from typing import Callable -import pandas as pd - -from woodwork.accessor_utils import _is_dask_series, _is_spark_series from woodwork.logical_types import ( URL, Address, @@ -353,38 +350,12 @@ def get_random_sample(series_, **kwargs): "replace": False, "random_state": 42, } - if isinstance(series, pd.Series): - # Special case for series with no valid values - if series.count() == 0: - return Unknown() - kw_args_sampling["n"] = INFERENCE_SAMPLE_SIZE - series = get_random_sample(series, **kw_args_sampling) - else: - # Dask and Spark don't accept the n argument - - # prevent division by zero error - series_len = len(series) - if not series_len: - return Unknown() - kw_args_sampling["frac"] = INFERENCE_SAMPLE_SIZE / series_len - if _is_dask_series(series): - series = get_random_sample( - series.head(series_len, npartitions=-1), - **kw_args_sampling, - ) - elif 
_is_spark_series(series): - series = get_random_sample(series, **kw_args_sampling) - series = series.to_pandas() - else: - raise ValueError( - f"Unsupported series type `{type(series)}`", - ) # pragma: no cover - - # For dask or spark collections, unknown type special case comes - # *after* head calls to avoid evaluating a potentially large - # dataset - if series.count() == 0: - return Unknown() + + # Special case for series with no valid values + if series.count() == 0: + return Unknown() + kw_args_sampling["n"] = INFERENCE_SAMPLE_SIZE + series = get_random_sample(series, **kw_args_sampling) def get_inference_matches(types_to_check, series, type_matches=[]): # Since NaturalLanguage isn't inferred by default, make sure to check diff --git a/woodwork/type_sys/utils.py b/woodwork/type_sys/utils.py index 90b761135..9cedc9ab8 100644 --- a/woodwork/type_sys/utils.py +++ b/woodwork/type_sys/utils.py @@ -2,19 +2,12 @@ from dateutil.parser import ParserError import woodwork as ww -from woodwork.accessor_utils import _is_dask_series, _is_spark_series -from woodwork.utils import import_or_none - -ps = import_or_none("pyspark.pandas") -dd = import_or_none("dask.dataframe") def col_is_datetime(col, datetime_format=None): """Determine if a dataframe column contains datetime values or not. Returns True if column contains datetimes, False if not. Optionally specify the datetime format string for the column. Will not infer numeric data as datetime.""" - if _is_spark_series(col): - col = col.to_pandas() if pd.api.types.is_datetime64_any_dtype(col): return True @@ -51,11 +44,6 @@ def col_is_datetime(col, datetime_format=None): def _is_numeric_series(series, logical_type): """Determines whether a series will be considered numeric for the purposes of determining if it can be a time_index.""" - if _is_spark_series(series): - series = series.to_pandas() - if _is_dask_series(series): - series = series.get_partition(0).compute() - # If column can't be made to be numeric, don't bother checking Logical Type try: pd.to_numeric(series, errors="raise") diff --git a/woodwork/typing.py b/woodwork/typing.py index 96aa9b094..da5367062 100644 --- a/woodwork/typing.py +++ b/woodwork/typing.py @@ -1,16 +1,7 @@ -from typing import Dict, Hashable, Union +from typing import Dict, Hashable import pandas as pd -from woodwork.utils import import_or_none - -dd = import_or_none("dask.dataframe") -ps = import_or_none("pyspark.pandas") - ColumnName = Hashable UseStandardTagsDict = Dict[ColumnName, bool] AnyDataFrame = pd.DataFrame -if dd: - AnyDataFrame = Union[AnyDataFrame, dd.DataFrame] -if ps: - AnyDataFrame = Union[AnyDataFrame, ps.DataFrame] diff --git a/woodwork/utils.py b/woodwork/utils.py index 3647b145c..53da5e80e 100644 --- a/woodwork/utils.py +++ b/woodwork/utils.py @@ -219,7 +219,7 @@ def _is_url(string): return "http" in string -def _reformat_to_latlong(latlong, is_spark=False): +def _reformat_to_latlong(latlong): """ Accepts 2-tuple like values, or a single NaN like value. NaN like values are replaced with np.nan. 
@@ -244,8 +244,6 @@ def _reformat_to_latlong(latlong, is_spark=False): ) latlong = (latitude, longitude) - if is_spark: - latlong = list(latlong) return latlong if _is_nan(latlong): @@ -277,26 +275,18 @@ def _coerce_to_float(val): def _is_valid_latlong_series(series): """Returns True if all elements in the series contain properly formatted LatLong values, otherwise returns False""" - if ww.accessor_utils._is_dask_series(series): - series = series.get_partition(0).compute() - if ww.accessor_utils._is_spark_series(series): - series = series.to_pandas() - is_spark = True - else: - is_spark = False - if series.apply(_is_valid_latlong_value, args=(is_spark,)).all(): + if series.apply(_is_valid_latlong_value).all(): return True return False -def _is_valid_latlong_value(val, is_spark=False): - """Returns True if the value provided is a properly formatted LatLong value for a - pandas, Dask or Spark Series, otherwise returns False.""" +def _is_valid_latlong_value(val): + """Returns True if the value provided is a properly formatted LatLong value, otherwise returns False.""" if isinstance(val, (list, tuple)): if len(val) != 2: return False - if not isinstance(val, list if is_spark else tuple): + if not isinstance(val, tuple): return False latitude, longitude = val @@ -315,9 +305,6 @@ def _is_valid_latlong_value(val, is_spark=False): else: return _is_valid_latlong_value(val) - if is_spark and val is None: - return True - return False @@ -521,31 +508,14 @@ def concat_columns(objs, validate_schema=True): col_names_seen.add(name) - # Perform concatenation with the correct library - obj = objs[0] - dd = import_or_none("dask.dataframe") - ps = import_or_none("pyspark.pandas") - - lib = pd - if ww.accessor_utils._is_spark_dataframe(obj) or ww.accessor_utils._is_spark_series( - obj, - ): - lib = ps - elif ww.accessor_utils._is_dask_dataframe(obj) or ww.accessor_utils._is_dask_series( - obj, - ): - lib = dd - - combined_df = lib.concat(objs, axis=1, join="outer") + combined_df = pd.concat(objs, axis=1, join="outer") # The lib.concat breaks the woodwork schema for dataframes with different shapes # or mismatched indices. mask = combined_df.isnull().any() null_cols = mask[mask].index - if not ww.accessor_utils._is_dask_dataframe(combined_df): - null_cols = null_cols.to_numpy() - else: - null_cols = list(null_cols) + null_cols = null_cols.to_numpy() + for null_col in null_cols: if null_col in logical_types and isinstance( logical_types[null_col], @@ -630,13 +600,6 @@ def _infer_datetime_format(dates, n=100): """ dates_no_null = dates.dropna() - ps = import_or_none("pyspark.pandas") - dd = import_or_none("dask.dataframe") - if ps and isinstance(dates_no_null, ps.series.Series): - dates_no_null = dates_no_null.to_pandas() - if dd and isinstance(dates_no_null, dd.Series): - dates_no_null = dates_no_null.compute() - random_n = dates_no_null.sample(min(n, len(dates_no_null)), random_state=42) if len(random_n) == 0: