From 3d4fe4982bd7ae8abe1d63e44ef793b3ea1ca18c Mon Sep 17 00:00:00 2001 From: Mathias Claassen Date: Wed, 24 Apr 2024 16:23:54 -0300 Subject: [PATCH 1/2] Clean up docs --- .devcontainer/Dockerfile | 12 ------------ .devcontainer/devcontainer.json | 33 --------------------------------- .github/workflows/CI.yml | 10 ---------- .vscode/settings.json | 22 ++-------------------- CODE_OF_CONDUCT.md | 9 --------- SUPPORT.md | 25 ------------------------- 6 files changed, 2 insertions(+), 109 deletions(-) delete mode 100644 .devcontainer/Dockerfile delete mode 100644 .devcontainer/devcontainer.json delete mode 100644 CODE_OF_CONDUCT.md delete mode 100644 SUPPORT.md diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile deleted file mode 100644 index 35bb85f..0000000 --- a/.devcontainer/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM mcr.microsoft.com/devcontainers/python:3 - -RUN python -m pip install --upgrade pip \ - && python -m pip install 'flit>=3.8.0' - -ENV FLIT_ROOT_INSTALL=1 - -COPY pyproject.toml . -RUN touch README.md \ - && mkdir -p src/tf_tabular \ - && python -m flit install --only-deps --deps develop \ - && rm -r pyproject.toml README.md src diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index ec9a80b..0000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,33 +0,0 @@ -// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: -// https://github.com/microsoft/vscode-dev-containers/tree/v0.222.0/containers/python-3-miniconda -{ - "name": "Python Environment", - "build": { - "dockerfile": "Dockerfile", - "context": ".." 
- }, - "customizations": { - "vscode": { - "extensions": [ - "editorconfig.editorconfig", - "github.vscode-pull-request-github", - "ms-azuretools.vscode-docker", - "ms-python.python", - "ms-python.flake8", - "ms-python.black-formatter", - "bungcip.better-toml", - "GitHub.copilot" - ], - "settings": { - "python.defaultInterpreterPath": "/usr/local/bin/python", - "black-formatter.path": [ - "/usr/local/py-utils/bin/black" - ], - "flake8.path": [ - "/usr/local/py-utils/bin/flake8" - ] - } - } - }, - "onCreateCommand": "pre-commit install-hooks" -} \ No newline at end of file diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 19567e6..5e1e7e4 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -9,11 +9,6 @@ on: workflow_dispatch: jobs: - # validation: - # uses: microsoft/action-python/.github/workflows/validation.yml@0.7.2 - # with: - # workdir: '.' - build: runs-on: ubuntu-latest steps: @@ -33,8 +28,3 @@ jobs: run: ruff format --check . - name: Run Mypy run: mypy --ignore-missing-imports . 
- # publish: - # uses: microsoft/action-python/.github/workflows/publish.yml@0.7.2 - # secrets: - # PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - # TEST_PYPI_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} diff --git a/.vscode/settings.json b/.vscode/settings.json index 5d1a32a..615979d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,27 +3,9 @@ "editor.formatOnPaste": true, "files.trimTrailingWhitespace": true, "files.autoSave": "onFocusChange", - "git.autofetch": true, - "[jsonc]": { - "editor.defaultFormatter": "vscode.json-language-features" - }, "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter" + "editor.defaultFormatter": "charliermarsh.ruff" }, - "python.defaultInterpreterPath": "/usr/local/bin/python", - "python.formatting.provider": "black", "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, - "pylint.args": [ - "--rcfile=pyproject.toml" - ], - "black-formatter.args": [ - "--config=pyproject.toml" - ], - "flake8.args": [ - "--toml-config=pyproject.toml" - ], - "isort.args": [ - "--settings-path=pyproject.toml" - ] -} +} \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index f9ba8cf..0000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,9 +0,0 @@ -# Microsoft Open Source Code of Conduct - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
- -Resources: - -- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) -- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) -- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/SUPPORT.md b/SUPPORT.md deleted file mode 100644 index bcebadb..0000000 --- a/SUPPORT.md +++ /dev/null @@ -1,25 +0,0 @@ -# TODO: The maintainer of this repo has not yet edited this file - -**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? - -- **No CSS support:** Fill out this template with information about how to file issues and get help. -- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). -- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. - -*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* - -# Support - -## How to file issues and get help - -This project uses GitHub Issues to track bugs and feature requests. Please search the existing -issues before filing new issues to avoid duplicates. For new issues, file your bug or -feature request as a new Issue. - -For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE -FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER -CHANNEL. WHERE WILL YOU HELP PEOPLE?**. - -## Microsoft Support Policy - -Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 
From e541b4efcce4841ac2b5f33f34aab268683e80c7 Mon Sep 17 00:00:00 2001 From: Mathias Claassen Date: Thu, 25 Apr 2024 14:22:55 -0300 Subject: [PATCH 2/2] Create first version of Readme --- README.md | 80 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index f753b19..91a3edf 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,64 @@ # TF Tabular -### Feature Overview -* Create input layers based on lists of columns -* No model building or training: Build whatever you want on top -* Support custom embeddings -* Support attention for mixing sequence layers -* Support multi-hot categoricals -* Support computing vocab and normalization params? +TF Tabular is a project aimed at simplifying the process of handling tabular data in TensorFlow. It provides utilities for building models on top of numeric, categorical, multihot, and sequential data types. +## Features -### Competitor analysis -* DeepTables: - * This is for TensorFlow - * Broader scope: Includes model building and training -* Pytorch tabular: - * Only Pytorch - * Broader scope: Includes model building and training - * Not focused on recommenders (no support for multi-hot and sequence columns https://github.com/manujosephv/pytorch_tabular/issues/174) +- Create input layers based on lists of columns +- Support custom embeddings +- Support attention for mixing sequence layers +- Support multi-hot categoricals +- No model building or training: Build whatever you want on top -## Project Organization +## Installation -- `.github/workflows`: Contains GitHub Actions used for building, testing, and publishing. -- `.devcontainer/Dockerfile`: Contains Dockerfile to build a development container for VSCode with all the necessary extensions for Python development installed. 
-- `.devcontainer/devcontainer.json`: Contains the configuration for the development container for VSCode, including the Docker image to use, any additional VSCode extensions to install, and whether or not to mount the project directory into the container. -- `.vscode/settings.json`: Contains VSCode settings specific to the project, such as the Python interpreter to use and the maximum line length for auto-formatting. -- `src`: Place new source code here. -- `tests`: Contains Python-based test cases to validate source code. -- `pyproject.toml`: Contains metadata about the project and configurations for additional tools used to format, lint, type-check, and analyze Python code. +To get started with TF Tabular, you will need to install it using pip: -### `pyproject.toml` +```sh +pip install tf-tabular +``` -The pyproject.toml file is a centralized configuration file for modern Python projects. It streamlines the development process by managing project metadata, dependencies, and development tool configurations in a single, structured file. This approach ensures consistency and maintainability, simplifying project setup and enabling developers to focus on writing quality code. Key components include project metadata, required and optional dependencies, development tool configurations (e.g., linters, formatters, and test runners), and build system specifications. +## Usage -In this particular pyproject.toml file, the [build-system] section specifies that the Flit package should be used to build the project. The [project] section provides metadata about the project, such as the name, description, authors, and classifiers. The [project.optional-dependencies] section lists optional dependencies, like pyspark, while the [project.urls] section supplies URLs for project documentation, source code, and issue tracking. 
+Here is a basic example of how to use TF Tabular: -The file also contains various configuration sections for different tools, including bandit, black, coverage, flake8, pyright, pytest, tox, and pylint. These sections specify settings for each tool, such as the maximum line length for flake8 and the minimum code coverage percentage for coverage. +```python +from tf_tabular.builder import InputBuilder +# Define columns to use and specify additional parameters: +categoricals = ['Pclass', 'no_cabin'] +numericals = ['Age', 'Fare'] +# .... -## TODO: -* Parse dataset to separate numeric vs categoricals, multihots and sequencials \ No newline at end of file +# Build model: +input_builder = InputBuilder() +input_builder.add_inputs_list(categoricals=categoricals, + numericals=numericals, + normalization_params=norm_params, + vocabs=vocabs, + embedding_dims=embedding_dims) +inputs, output = input_builder.build_input_layers() +output = Dense(1, activation='sigmoid')(output) + +model = Model(inputs=inputs, outputs=output) +``` + + + + +Look at the examples folder for more complete examples. + +## Contributing +Contributions to TF Tabular are welcome. If you have a feature you'd like to add, or a bug you'd like to fix, please open a pull request. + +## Roadmap: +This is a list of possible features to be added in the future depending on need and interest expressed by the community. + +- [ ] Parse dataset to separate numeric vs categoricals, multihots and sequentials +- [ ] Implement other types of normalization +- [ ] Support computing vocab and normalization params? +- [ ] Improve documentation and provide more usage examples + +## License +TF Tabular is licensed under the MIT License. See the LICENSE file for more details.