From 5bd98f96bcba319403023a7a329dd7e69e044513 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Wed, 6 Sep 2023 11:18:11 -0700 Subject: [PATCH 1/3] removed python stuff since it was moved to its own repo --- .github/workflows/build_cmake.yml | 8 - .github/workflows/build_wheels.yml | 106 ----- LICENSE | 76 ---- MANIFEST.in | 39 -- pyproject.toml | 23 -- python/CMakeLists.txt | 87 ---- python/README.md | 101 ----- python/datasketches/DensityWrapper.py | 87 ---- python/datasketches/KernelFunction.py | 35 -- python/datasketches/PySerDe.py | 110 ----- python/datasketches/TuplePolicy.py | 77 ---- python/datasketches/TupleWrapper.py | 208 ---------- python/datasketches/__init__.py | 38 -- python/include/kernel_function.hpp | 98 ----- python/include/py_object_lt.hpp | 37 -- python/include/py_object_ostream.hpp | 48 --- python/include/py_serde.hpp | 113 ------ python/include/quantile_conditional.hpp | 104 ----- python/include/tuple_policy.hpp | 136 ------- python/jupyter/CPCSketch.ipynb | 345 ---------------- python/jupyter/FrequentItemsSketch.ipynb | 354 ---------------- python/jupyter/HLLSketch.ipynb | 346 ---------------- python/jupyter/KLLSketch.ipynb | 463 --------------------- python/jupyter/ThetaSketchNotebook.ipynb | 403 ------------------- python/pybind11Path.cmd | 21 - python/src/__init__.py | 18 - python/src/count_wrapper.cpp | 101 ----- python/src/cpc_wrapper.cpp | 78 ---- python/src/datasketches.cpp | 58 --- python/src/density_wrapper.cpp | 95 ----- python/src/fi_wrapper.cpp | 182 --------- python/src/hll_wrapper.cpp | 126 ------ python/src/kll_wrapper.cpp | 158 -------- python/src/ks_wrapper.cpp | 68 ---- python/src/py_serde.cpp | 112 ------ python/src/quantiles_wrapper.cpp | 155 ------- python/src/req_wrapper.cpp | 154 ------- python/src/theta_wrapper.cpp | 168 -------- python/src/tuple_wrapper.cpp | 216 ---------- python/src/vector_of_kll.cpp | 490 ----------------------- python/src/vo_wrapper.cpp | 173 -------- python/tests/__init__.py | 16 - python/tests/count_min_test.py | 86 ---- python/tests/cpc_test.py | 69 ---- python/tests/density_test.py | 93 ----- python/tests/fi_test.py | 149 ------- python/tests/hll_test.py | 129 ------ python/tests/kll_test.py | 159 -------- python/tests/quantiles_test.py | 160 -------- python/tests/req_test.py | 159 -------- python/tests/theta_test.py | 156 -------- python/tests/tuple_test.py | 213 ---------- python/tests/vector_of_kll_test.py | 148 ------- python/tests/vo_test.py | 132 ------ setup.py | 110 ----- tox.ini | 26 -- 56 files changed, 7620 deletions(-) delete mode 100644 .github/workflows/build_wheels.yml delete mode 100644 MANIFEST.in delete mode 100644 pyproject.toml delete mode 100644 python/CMakeLists.txt delete mode 100644 python/README.md delete mode 100644 python/datasketches/DensityWrapper.py delete mode 100644 python/datasketches/KernelFunction.py delete mode 100644 python/datasketches/PySerDe.py delete mode 100644 python/datasketches/TuplePolicy.py delete mode 100644 python/datasketches/TupleWrapper.py delete mode 100644 python/datasketches/__init__.py delete mode 100644 python/include/kernel_function.hpp delete mode 100644 python/include/py_object_lt.hpp delete mode 100644 python/include/py_object_ostream.hpp delete mode 100644 python/include/py_serde.hpp delete mode 100644 python/include/quantile_conditional.hpp delete mode 100644 python/include/tuple_policy.hpp delete mode 100644 python/jupyter/CPCSketch.ipynb delete mode 100644 python/jupyter/FrequentItemsSketch.ipynb delete mode 100644 python/jupyter/HLLSketch.ipynb delete mode 100644 python/jupyter/KLLSketch.ipynb delete mode 100644 python/jupyter/ThetaSketchNotebook.ipynb delete mode 100644 python/pybind11Path.cmd delete mode 100644 python/src/__init__.py delete mode 100644 python/src/count_wrapper.cpp delete mode 100644 python/src/cpc_wrapper.cpp delete mode 100644 python/src/datasketches.cpp delete mode 100644 python/src/density_wrapper.cpp delete mode 100644 python/src/fi_wrapper.cpp delete mode 100644 python/src/hll_wrapper.cpp delete mode 100644 python/src/kll_wrapper.cpp delete mode 100644 python/src/ks_wrapper.cpp delete mode 100644 python/src/py_serde.cpp delete mode 100644 python/src/quantiles_wrapper.cpp delete mode 100644 python/src/req_wrapper.cpp delete mode 100644 python/src/theta_wrapper.cpp delete mode 100644 python/src/tuple_wrapper.cpp delete mode 100644 python/src/vector_of_kll.cpp delete mode 100644 python/src/vo_wrapper.cpp delete mode 100644 python/tests/__init__.py delete mode 100644 python/tests/count_min_test.py delete mode 100644 python/tests/cpc_test.py delete mode 100644 python/tests/density_test.py delete mode 100644 python/tests/fi_test.py delete mode 100644 python/tests/hll_test.py delete mode 100644 python/tests/kll_test.py delete mode 100644 python/tests/quantiles_test.py delete mode 100644 python/tests/req_test.py delete mode 100644 python/tests/theta_test.py delete mode 100644 python/tests/tuple_test.py delete mode 100644 python/tests/vector_of_kll_test.py delete mode 100644 python/tests/vo_test.py delete mode 100644 setup.py delete mode 100644 tox.ini diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml index 8e4c7cb7..9e9807df 100644 --- a/.github/workflows/build_cmake.yml +++ b/.github/workflows/build_cmake.yml @@ -50,11 +50,3 @@ jobs: run: cmake --build build --config Release - name: Run C++ tests run: cmake --build build --config Release --target ${{ matrix.config.test_target }} - - name: Set up Python 3.x - uses: actions/setup-python@v4 - with: - python-version: '3.8' # 3.x grabs latest minor version of python3, but 3.9 not fully supported yet - - name: Install Python dependencies - run: python -m pip install --upgrade pip setuptools wheel tox - - name: Build and run Python tests - run: python -m tox diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml deleted file mode 100644 index 28d12a0b..00000000 --- a/.github/workflows/build_wheels.yml +++ /dev/null @@ -1,106 +0,0 @@ -name: Build Python Wheels - -on: - # allow manual runs - workflow_dispatch: - - # run when we tag a release - #release: - # types: - # - "created" - -env: - BUILD_TYPE: Release - -jobs: - build_sdist: - name: Source distribution - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - name: Install Python - with: - python-version: '3.x' - - - name: Install build package - run: python -m pip install build --user - - - name: Build sdist - run: python -m build --sdist --outdir dist - - - uses: actions/upload-artifact@v3 - with: - path: dist/*.tar.gz - - build_wheels: - name: ${{ matrix.config.name }} - runs-on: ${{ matrix.config.os }} - strategy: - matrix: - config: - - { - name: "MacOS Latest (Intel)", - os: macos-latest, - cibw-arch: macosx_x86_64 - } - - { - name: "MacOS Latest (Apple Silicon)", - os: macos-latest, - cibw-arch: macosx_arm64 - } - - { - name: "Ubuntu Latest (x86_64)", - os: ubuntu-latest, - cibw-arch: manylinux_x86_64 - } - - { - name: "Ubuntu Latest (ARM64)", - os: ubuntu-latest, - cibw-arch: manylinux_aarch64, - use-qemu: true - } - - { - name: "Ubuntu Latest (i686)", - os: ubuntu-latest, - cibw-arch: manylinux_i686 - } - - { - name: "Windows Latest", - os: windows-latest, - cibw-arch: win_amd64 - } - - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Set up Python 3.x - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: Set up QEMU for linux/arm64 builds - if: runner.os == 'Linux' && matrix.config.use-qemu == true - uses: docker/setup-qemu-action@v2 - with: - platforms: arm64 - - - name: Install Python dependencies - run: python -m pip install cibuildwheel==2.12.0 - - - name: Build wheels - run: python -m cibuildwheel --output-dir dist - env: - CIBW_ARCHS_LINUX: "auto aarch64" - CIBW_ARCHS_MACOS: "x86_64 arm64" - CIBW_ENVIRONMENT_MACOS: CMAKE_OSX_ARCHITECTURES=${{ matrix.config.cibw-arch == 'macosx_x86_64' && 'x86_64' || matrix.config.cibw-arch == 'macosx_arm64' && 'arm64' || '' }} - CIBW_BUILD: "*-${{ matrix.config.cibw-arch }}" - CIBW_BEFORE_BUILD_LINUX: "yum remove -y cmake" - CIBW_BEFORE_BUILD: "python -m pip install cmake>=3.18" - CIBW_SKIP: "*-win32 pp*-aarch64 pp*-macosx" - - - uses: actions/upload-artifact@v3 - with: - path: ./dist/*.whl diff --git a/LICENSE b/LICENSE index 2ac1f9a5..d156d1cc 100644 --- a/LICENSE +++ b/LICENSE @@ -205,7 +205,6 @@ APPENDIX A: How to apply the Apache License to your work. ------------------------------------------------------------- - APPENDIX B: Additional licenses relevant to this product. This product includes a number of source files with code that has been @@ -215,43 +214,6 @@ APPENDIX B: Additional licenses relevant to this product. conditions of the following licenses. - - ============================================================= - MIT License - ============================================================= - Original source code: - https://github.com/benjaminjack/python_cpp_example - ------------------------------------------------------------- - Copyright (c) 2017 Benjamin R. Jack - - MIT License (https://opensource.org/licenses/MIT): - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - ------------------------------------------------------------- - Code locations: - * https://github.com/apache/datasketches-cpp/blob/master/setup.py - that is adapted from the above. - - - ============================================================= Boost License (https://www.boost.org/LICENSE_1_0.txt) ============================================================= @@ -287,44 +249,6 @@ APPENDIX B: Additional licenses relevant to this product. of CMake configuration if configured to build tests. - ============================================================= - BSD License - ============================================================= - Original source code: - https://github.com/pybind/pybind11/blob/master/LICENSE - - Copyright (c) 2016 Wenzel Jakob , All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - 3. Neither the name of the copyright holder nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - ------------------------------------------------------------- - Code Locations: - Found only in the convenience binaries distributed from PyPI, which rely - on pybind11 code during compilation. - - ============================================================= Public Domain ============================================================= diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 60da7ede..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,39 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -global-include CMakeLists.txt -global-include *.cpp -global-include *.c -global-include *.hpp -global-include *.h -global-include *.bin -global-include *.in - -graft cmake -graft common -graft cpc -graft fi -graft hll -graft kll -graft req -graft theta -graft tuple -graft sampling -graft python - -# exclusions appear after including subdirectories -prune build diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 42479bc0..00000000 --- a/pyproject.toml +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[build-system] -requires = ["wheel", - "setuptools >= 30.3.0", - "cmake >= 3.16", - "pybind11[global] >= 2.6.0"] -build-backend = "setuptools.build_meta" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt deleted file mode 100644 index 508e1734..00000000 --- a/python/CMakeLists.txt +++ /dev/null @@ -1,87 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0") -find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED) -else() -find_package(Python3 COMPONENTS Interpreter Development REQUIRED) -endif() - -# only Windows+MSVC seems to have trouble locating pybind11 -if (MSVC) - execute_process(COMMAND cmd.exe /c ${CMAKE_CURRENT_SOURCE_DIR}/pybind11Path.cmd "${Python3_EXECUTABLE}" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_STRIP_TRAILING_WHITESPACE - OUTPUT_VARIABLE EXTRA_PACKAGE_PATH) - set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${EXTRA_PACKAGE_PATH}) -endif() - -find_package(pybind11 CONFIG REQUIRED) - -pybind11_add_module(python MODULE EXCLUDE_FROM_ALL THIN_LTO) - -target_link_libraries(python - PRIVATE - common - hll - kll - cpc - fi - theta - tuple - sampling - req - quantiles - count - density - pybind11::module -) - -set_target_properties(python PROPERTIES - PREFIX "" - OUTPUT_NAME _datasketches -) - -target_include_directories(python - PUBLIC - $/include> - $ -) - -# ensure we make a .so on Mac rather than .dylib -if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") - set_target_properties(python PROPERTIES SUFFIX ".so") -endif() - -target_sources(python - PRIVATE - src/datasketches.cpp - src/hll_wrapper.cpp - src/kll_wrapper.cpp - src/cpc_wrapper.cpp - src/fi_wrapper.cpp - src/theta_wrapper.cpp - src/tuple_wrapper.cpp - src/vo_wrapper.cpp - src/req_wrapper.cpp - src/quantiles_wrapper.cpp - src/density_wrapper.cpp - src/ks_wrapper.cpp - src/count_wrapper.cpp - src/vector_of_kll.cpp - src/py_serde.cpp -) diff --git a/python/README.md b/python/README.md deleted file mode 100644 index 33742cde..00000000 --- a/python/README.md +++ /dev/null @@ -1,101 +0,0 @@ -Apache DataSketchs Logo - -# The Apache DataSketches Library for Python - -This is the official version of the [Apache DataSketches](https://datasketches.apache.org) Python library. - -In the analysis of big data there are often problem queries that don’t scale because they require huge compute resources and time to generate exact results. Examples include count distinct, quantiles, most-frequent items, joins, matrix computations, and graph analysis. - -If approximate results are acceptable, there is a class of specialized algorithms, called streaming algorithms, or sketches that can produce results orders-of magnitude faster and with mathematically proven error bounds. For interactive queries there may not be other viable alternatives, and in the case of real-time analysis, sketches are the only known solution. - -This package provides a variety of sketches as described below. Wherever a specific type of sketch exists in Apache DataSketches packages for other languages, the sketches will be portable between languages (for platforms with the same endianness). - -## Building and Installation - -Once cloned, the library can be installed by running `python3 -m pip install .` in the project root directory -- not the python subdirectory -- which will also install the necessary dependencies, namely NumPy and [pybind11[global]](https://github.com/pybind/pybind11). - -If you prefer to call the `setup.py` build script directly, which is discouraged, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`. - -The library is also available from PyPI via `python3 -m pip install datasketches`. - -## Usage - -Having installed the library, loading the Apache DataSketches Library in Python is simple: `import datasketches`. - -The unit tests are mostly structured in a tutorial style and can be used as a reference example for how to feed data into and query the different types of sketches. - -## Available Sketch Classes - -- KLL (Absolute Error Quantiles) - - `kll_ints_sketch` - - `kll_floats_sketch` - - `kll_doubles_sketch` - - `kll_items_sketch` -- Quantiles (Absolute Error Quantiles, inferior algorithm) - - `quantiles_ints_sketch` - - `quantiles_floats_sketch` - - `quantiles_doubles_sketch` - - `quantiles_items_sketch` -- REQ (Relative Error Quantiles) - - `req_ints_sketch` - - `req_floats_sketch` - - `req_items_sketch` -- Frequent Items - - `frequent_strings_sketch` - - `frequent_items_sketch` - - Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}` -- Theta - - `update_theta_sketch` - - `compact_theta_sketch` (cannot be instantiated directly) - - `theta_union` - - `theta_intersection` - - `theta_a_not_b` - - `theta_jaccard_similarity` -- Tuple - - `update_tuple_sketch` - - `compact_tuple_sketch` (cannot be instantiated directly) - - `tuple_union` - - `tuple_intersection` - - `tuple_a_not_b` - - `tuple_jaccard_similarity` -- HLL - - `hll_sketch` - - `hll_union` - - Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}` -- CPC - - `cpc_sketch` - - `cpc_union` -- VarOpt Sampling - - `var_opt_sketch` - - `var_opt_union` -- Vector of KLL - - `vector_of_kll_ints_sketches` - - `vector_of_kll_floats_sketches` -- Kolmogorov-Smirnov Test - - `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches -- Density - - `density_sketch` -- Count-min sketch - - `count_min_sketch` - -## Known Differences from C++ - -The Python API largely mirrors the C++ API, with a few minor exceptions: The primary known differences are that Python on modern platforms does not support unsigned integer values or numeric values with fewer than 64 bits. As a result, you may not be able to produce identical sketches from within Python as you can with Java and C++. Loading those sketches after they have been serialized from another language will work as expected. - -The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors. - -We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments. - -## Developer Instructions - -The only developer-specific instructions relate to running unit tests. - -### Unit tests - -The Python unit tests are run via `tox`, with no arguments, from the project root directory -- not the python subdirectory. Tox creates a temporary virtual environment in which to build and run the unit tests. In the event you are missing the necessary package, tox may be installed with `python3 -m pip install --upgrade tox`. - -## License - -The Apache DataSketches Library is distributed under the Apache 2.0 License. - -There may be precompiled binaries provided as a convenience and distributed through PyPI via [https://pypi.org/project/datasketches/] contain compiled code from [pybind11](https://github.com/pybind/pybind11), which is distributed under a BSD license. diff --git a/python/datasketches/DensityWrapper.py b/python/datasketches/DensityWrapper.py deleted file mode 100644 index 0e66e7f3..00000000 --- a/python/datasketches/DensityWrapper.py +++ /dev/null @@ -1,87 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np - -from _datasketches import _density_sketch, KernelFunction -from .KernelFunction import GaussianKernel - -class density_sketch: - """An instance of a Density Sketch for kernel density estimation. Requires a KernelFunction object.""" - - def __init__(self, k:int, dim:int, kernel:KernelFunction=GaussianKernel()): - self._kernel = kernel - self._gadget = _density_sketch(k, dim, self._kernel) - - @classmethod - def deserialize(cls, data:bytes, kernel:KernelFunction=GaussianKernel()): - """Reads a bytes object and returns a density sketch, using the provided kerenl or defaulting to a Guassian kerenl""" - self = cls.__new__(cls) - self._kernel = kernel - self._gadget = _density_sketch.deserialize(data, kernel) - return self - - def update(self, point:np.array): - """Updates the sketch with the given point""" - self._gadget.update(point) - - def merge(self, other:'density_sketch'): - """Merges the provided sketch into this one""" - self._gadget.merge(other._gadget) - - def is_empty(self): - """Returns True if the sketch is empty, otherwise False""" - return self._gadget.is_empty() - - def get_k(self): - """Returns the configured parameter k""" - return self._gadget.get_k() - - def get_dim(self): - """Returns the configured parameter dim""" - return self._gadget.get_dim() - - def get_n(self): - """Returns the length of the input stream""" - return self._gadget.get_n() - - def get_num_retained(self): - """Returns the number of retained items (samples) in the sketch""" - return self._gadget.get_num_retained() - - def is_estimation_mode(self): - """Returns True if the sketch is in estimation mode, otherwise False""" - return self._gadget.is_estimation_mode() - - def get_estimate(self, point:np.array): - """Returns an approximate density at the given point""" - return self._gadget.get_estimate(point) - - def serialize(self): - """Serializes the sketch into a bytes object""" - return self._gadget.serialize() - - def __str__(self, print_levels:bool=False, print_items:bool=False): - """Produces a string summary of the sketch""" - return self._gadget.to_string(print_levels, print_items) - - def to_string(self, print_levels:bool=False, print_items:bool=False): - """Produces a string summary of the sketch""" - return self._gadget.to_string(print_levels, print_items) - - def __iter__(self): - return self._gadget.__iter__() diff --git a/python/datasketches/KernelFunction.py b/python/datasketches/KernelFunction.py deleted file mode 100644 index 7603b10e..00000000 --- a/python/datasketches/KernelFunction.py +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np - -from _datasketches import KernelFunction - -# This file provides an example Python Kernel Function implementation. -# -# Each implementation must extend the KernelFunction class -# and define the __call__ method - -# Implements a basic Gaussian Kernel -class GaussianKernel(KernelFunction): - def __init__(self, bandwidth: float=1.0): - KernelFunction.__init__(self) - self._bw = bandwidth - self._scale = -0.5 * (bandwidth ** -2) - - def __call__(self, a: np.array, b: np.array) -> float: - return np.exp(self._scale * np.linalg.norm(a - b)**2) diff --git a/python/datasketches/PySerDe.py b/python/datasketches/PySerDe.py deleted file mode 100644 index e4de82d2..00000000 --- a/python/datasketches/PySerDe.py +++ /dev/null @@ -1,110 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from _datasketches import PyObjectSerDe - -import struct - -# This file provides several Python SerDe implementation examples. -# -# Each implementation must extend the PyObjectSerDe class and define -# three methods: -# * get_size(item) returns an int of the number of bytes needed to -# serialize the given item -# * to_bytes(item) returns a bytes object representing a serialized -# version of the given item -# * from_bytes(data, offset) takes a bytes object (data) and an offset -# indicating where in the data array to start reading. The method -# returns a tuple with the newly reconstructed object and the -# total number of bytes beyond the offset read from the input data. - -# Implements a simple string-encoding scheme where a string is -# written as , with no null termination. -# This format allows pre-allocating each string, at the cost of -# additional storage. Using this format, the serialized string consumes -# 4 + len(item) bytes. -class PyStringsSerDe(PyObjectSerDe): - def get_size(self, item): - return int(4 + len(item)) - - def to_bytes(self, item: str): - b = bytearray() - b.extend(len(item).to_bytes(4, 'little')) - b.extend(map(ord,item)) - return bytes(b) - - def from_bytes(self, data: bytes, offset: int): - num_chars = int.from_bytes(data[offset:offset+3], 'little') - if (num_chars < 0 or num_chars > offset + len(data)): - raise IndexError(f'num_chars read must be non-negative and not larger than the buffer. Found {num_chars}') - str = data[offset+4:offset+4+num_chars].decode() - return (str, 4+num_chars) - -# Implements an integer encoding scheme where each integer is written -# as a 32-bit (4 byte) little-endian value. -class PyIntsSerDe(PyObjectSerDe): - def get_size(self, item): - return int(4) - - def to_bytes(self, item): - return struct.pack(' int: - return int(0) - - def update_summary(self, summary: int, update: int) -> int: - summary += update - return summary - - def __call__(self, summary: int, update: int) -> int: - summary += update - return summary - - -# Implements a MAX rule, where the largest integer value is always kept -class MaxIntPolicy(TuplePolicy): - def __init__(self): - TuplePolicy.__init__(self) - - def create_summary(self) -> int: - return int(-sys.maxsize-1) - - def update_summary(self, summary: int, update: int) -> int: - return max(summary, update) - - def __call__(self, summary: int, update: int) -> int: - return max(summary, update) - - -# Implements a MIN rule, where the smallest integer value is always kept -class MinIntPolicy(TuplePolicy): - def __init__(self): - TuplePolicy.__init__(self) - - def create_summary(self) -> int: - return int(sys.maxsize) - - def update_summary(self, summary: int, update: int) -> int: - return min(summary, update) - - def __call__(self, summary: int, update: int) -> int: - return min(summary, update) diff --git a/python/datasketches/TupleWrapper.py b/python/datasketches/TupleWrapper.py deleted file mode 100644 index 97f08209..00000000 --- a/python/datasketches/TupleWrapper.py +++ /dev/null @@ -1,208 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from abc import ABC, abstractmethod - -from _datasketches import _tuple_sketch, _compact_tuple_sketch, _update_tuple_sketch -from _datasketches import _tuple_union, _tuple_intersection -from _datasketches import _tuple_a_not_b, _tuple_jaccard_similarity -from _datasketches import PyObjectSerDe, theta_sketch, TuplePolicy - -class tuple_sketch(ABC): - """An abstract base class representing a Tuple Sketch.""" - _gadget: _tuple_sketch - - def __str__(self, print_items:bool=False): - return self._gadget.to_string(print_items) - - def is_empty(self): - """Returns True if the sketch is empty, otherwise False.""" - return self._gadget.is_empty() - - def get_estimate(self): - """Returns an estimate of the distinct count of the input stream.""" - return self._gadget.get_estimate() - - def get_upper_bound(self, num_std_devs:int): - """Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}.""" - return self._gadget.get_upper_bound(num_std_devs) - - def get_lower_bound(self, num_std_devs:int): - """Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}.""" - return self._gadget.get_lower_bound(num_std_devs) - - def is_estimation_mode(self): - """Returns True if the sketch is in estimation mode, otherwise False.""" - return self._gadget.is_estimation_mode() - - def get_theta(self): - """Returns theta (the effective sampling rate) as a fraction from 0 to 1.""" - return self._gadget.get_theta() - - def get_theta64(self): - """Returns theta as a 64-bit integer value.""" - return self._gadget.get_theta64() - - def get_num_retained(self): - """Returns the number of items currently in the sketch.""" - return self._gadget.get_num_retained() - - def get_seed_hash(self): - """Returns a hash of the seed used in the sketch.""" - return self._gadget.get_seed_hash() - - def is_ordered(self): - """Returns True if the sketch entries are sorder, otherwise False.""" - return self._gadget.is_ordered() - - def __iter__(self): - return self._gadget.__iter__() - - -class compact_tuple_sketch(tuple_sketch): - """An instance of a Tuple Sketch that has been compacted and can no longer accept updates.""" - - def __init__(self, other:tuple_sketch, ordered:bool = True): - if other == None: - self._gadget = None - else: - self._gadget = _compact_tuple_sketch(other, ordered) - - def serialize(self, serde:PyObjectSerDe): - """Serializes the sketch into a bytes object with the provided SerDe.""" - return self._gadget.serialize(serde) - - @classmethod - def from_theta_sketch(cls, sketch:theta_sketch, summary, seed:int=_tuple_sketch.DEFAULT_SEED): - """Creates a comapct Tuple Sketch from a Theta Sketch using a fixed summary value.""" - self = cls.__new__(cls) - self._gadget = _compact_tuple_sketch(sketch, summary, seed) - return self - - @classmethod - def deserialize(cls, data:bytes, serde:PyObjectSerDe, seed:int=_tuple_sketch.DEFAULT_SEED): - """Reads a bytes object and uses the provded SerDe to return the corresponding compact_tuple_sketch.""" - self = cls.__new__(cls) - self._gadget = _compact_tuple_sketch.deserialize(data, serde, seed) - return self - - -class update_tuple_sketch(tuple_sketch): - """An instance of a Tuple Sketch that is available for updates. Requires a Policy object to handle Summary values.""" - - def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED): - self._policy = policy - self._gadget = _update_tuple_sketch(self._policy, lg_k, p, seed) - - def update(self, datum, value): - """Updates the sketch with the provided item and summary value.""" - self._gadget.update(datum, value) - - def compact(self, ordered:bool = True) -> compact_tuple_sketch: - """Returns a compacted form of the sketch, optionally sorting it.""" - return self._gadget.compact(ordered) - - def trim(self): - """Removes retained entries in excess of the nominal size k (if any).""" - self._gadget.trim() - - def reset(self): - """Resets the sketch to the initial empty state.""" - self._gadget.reset() - -class tuple_union: - """An object that can merge Tuple Sketches. Requires a Policy object to handle merging Summaries.""" - _policy: TuplePolicy - - def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED): - self._policy = policy - self._gadget = _tuple_union(self._policy, lg_k, p, seed) - - def update(self, sketch:tuple_sketch): - """Updates the union with the given sketch.""" - self._gadget.update(sketch._gadget) - - def get_result(self, ordered:bool = True) -> compact_tuple_sketch: - """Returns the sketch corresponding to the union result, optionally sorted.""" - return compact_tuple_sketch(self._gadget.get_result(ordered), ordered) - - def reset(self): - """Resets the union to the initial empty state.""" - self._gadget.reset() - - -class tuple_intersection: - """An object that can intersect Tuple Sketches. Requires a Policy object to handle intersecting Summaries.""" - _policy: TuplePolicy - - def __init__(self, policy:TuplePolicy, seed:int = _tuple_sketch.DEFAULT_SEED): - self._policy = policy - self._gadget = _tuple_intersection(self._policy, seed) - - def update(self, sketch:tuple_sketch): - """Intersects the provided sketch with the current intersection state.""" - self._gadget.update(sketch._gadget) - - def has_result(self) -> bool: - """Returns True if the intersection has a valid result, otherwise False.""" - return self._gadget.has_result() - - def get_result(self, ordered:bool = True) -> compact_tuple_sketch: - """Returns the sketch corresponding to the intersection result, optionally sorted.""" - return compact_tuple_sketch(self._gadget.get_result(ordered), ordered) - - -class tuple_a_not_b: - """An object that can peform the A-not-B operation between two sketches.""" - def __init__(self, seed:int = _tuple_sketch.DEFAULT_SEED): - self._gadget = _tuple_a_not_b(seed) - - def compute(self, a:tuple_sketch, b:tuple_sketch, ordered:bool=True) -> compact_tuple_sketch: - """Returns a sketch with the result of applying the A-not-B operation on the given inputs.""" - return compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget)) - - -class tuple_jaccard_similarity: - @staticmethod - def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED): - """Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches.""" - return _tuple_jaccard_similarity.jaccard(a._gadget, b._gadget, seed) - - @staticmethod - def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED): - """Returns True if sketch_a and sketch_b are equivalent, otherwise False.""" - return _tuple_jaccard_similarity.exactly_equal(a._gadget, b._gadget, seed) - - @staticmethod - def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED): - """Tests similarity of an actual sketch against an expected sketch. - - Computes the lower bound of the Jaccard index J_{LB} of the actual and expected sketches. - If J_{LB} >= threshold, then the sketches are considered to be similar sith a confidence of - 97.7% and returns True, otherwise False. - """ - return _tuple_jaccard_similarity.similarity_test(actual._gadget, expected._gadget, threshold, seed) - - @staticmethod - def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED): - """Tests dissimilarity of an actual sketch against an expected sketch. - - Computes the upper bound of the Jaccard index J_{UB} of the actual and expected sketches. - If J_{UB} <= threshold, then the sketches are considered to be dissimilar sith a confidence of - 97.7% and returns True, otherwise False. - """ - return _tuple_jaccard_similarity.dissimilarity_test(actual._gadget, expected._gadget, threshold, seed) diff --git a/python/datasketches/__init__.py b/python/datasketches/__init__.py deleted file mode 100644 index ac7295a6..00000000 --- a/python/datasketches/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""The Apache DataSketches Library for Python - -Provided under the Apache License, Verison 2.0 - -""" - -name = 'datasketches' - -from _datasketches import * - -from .PySerDe import * -from .TuplePolicy import * -from .KernelFunction import * - -# Wrappers around the pybind11 classes for cases where we -# need to define a python object that is persisted within -# the C++ object. Currently, the native python portion of -# a class derived from a C++ class may be garbage collected -# even though a pointer to the C++ portion remains valid. -from .TupleWrapper import * -from .DensityWrapper import * \ No newline at end of file diff --git a/python/include/kernel_function.hpp b/python/include/kernel_function.hpp deleted file mode 100644 index ca41c719..00000000 --- a/python/include/kernel_function.hpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -//#include -#include -#include - -#ifndef _KERNEL_FUNCTION_HPP_ -#define _KERNEL_FUNCTION_HPP_ - -namespace py = pybind11; - -namespace datasketches { - -/** - * @brief kernel_function provides the underlying base class from - * which native Python kernels ultimately inherit. The actual - * kernels implement KernelFunction, as shown in KernelFunction.py - */ -struct kernel_function { - virtual double operator()(py::array_t& a, const py::array_t& b) const = 0; - virtual ~kernel_function() = default; -}; - -/** - * @brief KernelFunction provides the "trampoline" class for pybind11 - * that allows for a native Python implementation of kernel - * functions. - */ -struct KernelFunction : public kernel_function { - using kernel_function::kernel_function; - - /** - * @brief Evaluates K(a,b), the kernel function for the given points a and b - * @param a the first vector - * @param b the second vector - * @return The function value K(a,b) - */ - double operator()(py::array_t& a, const py::array_t& b) const override { - PYBIND11_OVERRIDE_PURE_NAME( - double, // Return type - kernel_function, // Parent class - "__call__", // Name of function in python - operator(), // Name of function in C++ - a, b // Arguemnts - ); - } -}; - -/* The kernel_function_holder provides a concrete class that dispatches calls - * from the sketch to the kernel_function. This class is needed to provide a - * concrete object to produce a compiled library, but library users should - * never need to use this directly. - */ -struct kernel_function_holder { - explicit kernel_function_holder(std::shared_ptr kernel) : _kernel(kernel) {} - kernel_function_holder(const kernel_function_holder& other) : _kernel(other._kernel) {} - kernel_function_holder(kernel_function_holder&& other) : _kernel(std::move(other._kernel)) {} - kernel_function_holder& operator=(const kernel_function_holder& other) { _kernel = other._kernel; return *this; } - kernel_function_holder& operator=(kernel_function_holder&& other) { std::swap(_kernel, other._kernel); return *this; } - - double operator()(const std::vector& a, const py::array_t& b) const { - py::array_t a_arr(a.size(), a.data(), dummy_array_owner); - return _kernel->operator()(a_arr, b); - } - - double operator()(const std::vector& a, const std::vector& b) const { - py::array_t a_arr(a.size(), a.data(), dummy_array_owner); - py::array_t b_arr(b.size(), b.data(), dummy_array_owner); - return _kernel->operator()(a_arr, b_arr); - } - - private: - // a dummy object to "own" arrays when translating from std::vector to avoid a copy: - // https://github.com/pybind/pybind11/issues/323#issuecomment-575717041 - py::str dummy_array_owner; - std::shared_ptr _kernel; -}; - -} - -#endif // _KERNEL_FUNCTION_HPP_ \ No newline at end of file diff --git a/python/include/py_object_lt.hpp b/python/include/py_object_lt.hpp deleted file mode 100644 index d17fa7b1..00000000 --- a/python/include/py_object_lt.hpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef _PY_OBJECT_LT_HPP_ -#define _PY_OBJECT_LT_HPP_ - -#include - -/* - This header defines a less than operator on generic python - objects. The implementation calls the object's built-in __lt__() - method. If that method is not defined, the call may fail. -*/ - -struct py_object_lt { - bool operator()(const pybind11::object& a, const pybind11::object& b) const { - return a < b; - } -}; - -#endif // _PY_OBJECT_LT_HPP_ \ No newline at end of file diff --git a/python/include/py_object_ostream.hpp b/python/include/py_object_ostream.hpp deleted file mode 100644 index fdcabff3..00000000 --- a/python/include/py_object_ostream.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef _PY_OBJECT_OSTREAM_HPP_ -#define _PY_OBJECT_OSTREAM_HPP_ - -#include - -#include -#include - -/* - This header defines an ostream output operator on a generic python - object. The implementation calls the object's built-in __str__() - method. If that method is not defined, the call may fail. - - NOTE: This header must be included before the inclusion of - any sketch classes. -*/ - -namespace py = pybind11; - -namespace datasketches { - -static std::ostream& operator<<(std::ostream& os, const py::object& obj) { - os << std::string(pybind11::str(obj)); - return os; -} - -} - -#endif // _PY_OBJECT_OSTREAM_HPP_ diff --git a/python/include/py_serde.hpp b/python/include/py_serde.hpp deleted file mode 100644 index 3a5bb749..00000000 --- a/python/include/py_serde.hpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include - -#ifndef _PY_SERDE_HPP_ -#define _PY_SERDE_HPP_ - -namespace py = pybind11; - -namespace datasketches { - -/** - * @brief The py_object_serde is an abstract class that implements the - * datasketches serde interface, and is used to allow custom Python - * serialization of items wrapped as generic py::object types. The actual - * Python implementation classes must extend the PyObjectSerDe class. - */ -struct py_object_serde { - /** - * @brief Get the serialized size of an object, in bytes - * - * @param item A provided item - * @return int64_t The serialized size of the item, in bytes - */ - virtual int64_t get_size(const py::object& item) const = 0; - - /** - * @brief Serializes an item to a bytes object - * - * @param item A provided item - * @return The serialized image of the item as a Python bytes object - */ - virtual py::bytes to_bytes(const py::object& item) const = 0; - - /** - * @brief Constructs an object from a serialized image, reading the - * incoming buffer starting at the specified offset. - * - * @param bytes A buffer containing items from a serialized sketch - * @param offset The starting offset into the bytes buffer - * @return A Python tuple of the reconstructed item and the total number of bytes read - */ - virtual py::tuple from_bytes(py::bytes& bytes, size_t offset) const = 0; - - virtual ~py_object_serde() = default; - - // these methods are required by the serde interface; see common/include/serde.hpp for - // default implementations for C++ std::string and numeric types. - size_t size_of_item(const py::object& item) const; - size_t serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const; - size_t deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const; -}; - -/** - * @brief The PyObjectSerDe class provides a concrete base class - * that pybind11 uses as a "trampoline" to pass calls through to - * the abstract py_object_serde class. Custom Python serde implementations - * must extend this class. - */ -struct PyObjectSerDe : public py_object_serde { - using py_object_serde::py_object_serde; - - // trampoline definitions -- need one for each virtual function - int64_t get_size(const py::object& item) const override { - PYBIND11_OVERRIDE_PURE( - int64_t, // Return type - py_object_serde, // Parent class - get_size, // Name of function in C++ (must match Python name) - item // Argument(s) - ); - } - - py::bytes to_bytes(const py::object& item) const override { - PYBIND11_OVERRIDE_PURE( - py::bytes, // Return type - py_object_serde, // Parent class - to_bytes, // Name of function in C++ (must match Python name) - item // Argument(s) - ); - } - - py::tuple from_bytes(py::bytes& bytes, size_t offset) const override { - PYBIND11_OVERRIDE_PURE( - py::tuple, // Return type - py_object_serde, // Parent class - from_bytes, // Name of function in C++ (must match Python name) - bytes, offset // Argument(s) - ); - } -}; - -} - -#endif // _PY_SERDE_HPP_ diff --git a/python/include/quantile_conditional.hpp b/python/include/quantile_conditional.hpp deleted file mode 100644 index 5a28d374..00000000 --- a/python/include/quantile_conditional.hpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef _QUANTILE_CONDITIONAL_HPP_ -#define _QUANTILE_CONDITIONAL_HPP_ - -/* - This header defines conditionally compiled functions shared - across the set of quantile family sketches. -*/ - -#include "common_defs.hpp" -#include "py_serde.hpp" - -#include -#include - -namespace py = pybind11; - -// Serialization -// std::string and arithmetic types, where we don't need a separate serde -template::value || std::is_same::value, bool>::type = 0> -void add_serialization(py::class_& clazz) { - clazz.def( - "serialize", - [](const SK& sk) { - auto bytes = sk.serialize(); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, - "Serializes the sketch into a bytes object." - ) - .def_static( - "deserialize", - [](const std::string& bytes) { return SK::deserialize(bytes.data(), bytes.size()); }, - py::arg("bytes"), - "Deserializes the sketch from a bytes object." - ); -} - -// py::object and other types where the caller must provide a serde -template::value && !std::is_same::value, bool>::type = 0> -void add_serialization(py::class_& clazz) { - clazz.def( - "serialize", - [](const SK& sk, datasketches::py_object_serde& serde) { - auto bytes = sk.serialize(0, serde); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, py::arg("serde"), - "Serializes the sketch into a bytes object using the provided serde." - ) - .def_static( - "deserialize", - [](const std::string& bytes, datasketches::py_object_serde& serde) { - return SK::deserialize(bytes.data(), bytes.size(), serde); - }, py::arg("bytes"), py::arg("serde"), - "Deserializes the sketch from a bytes object using the provided serde." - ); -} - -// Vector Updates -// * Only allowed for POD types based on numpy restriction, which -// is equivalent to both std::is_trivial and std::is_standard_layout. -// * Nothing is added to types that are not PODs. -// POD type -template::value && std::is_standard_layout::value, bool>::type = 0> -void add_vector_update(py::class_& clazz) { - clazz.def( - "update", - [](SK& sk, py::array_t items) { - if (items.ndim() != 1) { - throw std::invalid_argument("input data must have only one dimension. Found: " - + std::to_string(items.ndim())); - } - auto array = items.template unchecked<1>(); - for (uint32_t i = 0; i < array.size(); ++i) sk.update(array(i)); - }, - py::arg("array"), - "Updates the sketch with the values in the given array" - ); -} - -// non-POD type -template::value || !std::is_standard_layout::value, bool>::type = 0> -void add_vector_update(py::class_& clazz) { - unused(clazz); -} - -#endif // _QUANTILE_CONDITIONAL_HPP_ \ No newline at end of file diff --git a/python/include/tuple_policy.hpp b/python/include/tuple_policy.hpp deleted file mode 100644 index 368ae005..00000000 --- a/python/include/tuple_policy.hpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include - -#ifndef _TUPLE_POLICY_HPP_ -#define _TUPLE_POLICY_HPP_ - -namespace py = pybind11; - -namespace datasketches { - -/** - * @brief tuple_policy provides the underlying base class from - * which native Python policies ultimately inherit. The actual - * policies implement TuplePolicy, as shown in TuplePolicy.py - */ -struct tuple_policy { - virtual py::object create_summary() const = 0; - virtual py::object update_summary(py::object& summary, const py::object& update) const = 0; - virtual py::object operator()(py::object& summary, const py::object& update) const = 0; - virtual ~tuple_policy() = default; -}; - -/** - * @brief TuplePolicy provides the "trampoline" class for pybind11 - * that allows for a native Python implementation of tuple - * sketch policies. - */ -struct TuplePolicy : public tuple_policy { - using tuple_policy::tuple_policy; - - /** - * @brief Create a summary object - * - * @return py::object representing a new summary - */ - py::object create_summary() const override { - PYBIND11_OVERRIDE_PURE( - py::object, // Return type - tuple_policy, // Parent class - create_summary, // Name of function in C++ (must match Python name) - // Argument(s) -- if any - ); - } - - /** - * @brief Update a summary object using this policy - * - * @param summary The current summary to update - * @param update The new value with which to update the summary - * @return py::object The updated summary - */ - py::object update_summary(py::object& summary, const py::object& update) const override { - PYBIND11_OVERRIDE_PURE( - py::object, // Return type - tuple_policy, // Parent class - update_summary, // Name of function in C++ (must match Python name) - summary, update // Arguments - ); - } - - /** - * @brief Applies this policy to summary with the provided update - * - * @param summary The current summary on which to apply the policy - * @param update An update to apply to the current summary - * @return py::object The potentially modified summary - */ - py::object operator()(py::object& summary, const py::object& update) const override { - PYBIND11_OVERRIDE_PURE_NAME( - py::object, // Return type - tuple_policy, // Parent class - "__call__", // Name of function in python - operator(), // Name of function in C++ - summary, update // Arguemnts - ); - } -}; - -/* The tuple_policy_holder provides a concrete class that dispatches calls - * from the sketch to the tuple_policy. This class is needed to provide a - * concrete object to produce a compiled library, but library users should - * never need to use this directly. - */ -struct tuple_policy_holder { - explicit tuple_policy_holder(std::shared_ptr policy) : _policy(policy) {} - tuple_policy_holder(const tuple_policy_holder& other) : _policy(other._policy) {} - tuple_policy_holder(tuple_policy_holder&& other) : _policy(std::move(other._policy)) {} - tuple_policy_holder& operator=(const tuple_policy_holder& other) { _policy = other._policy; return *this; } - tuple_policy_holder& operator=(tuple_policy_holder&& other) { std::swap(_policy, other._policy); return *this; } - - py::object create() const { return _policy->create_summary(); } - - void update(py::object& summary, const py::object& update) const { - summary = _policy->update_summary(summary, update); - } - - void operator()(py::object& summary, const py::object& update) const { - summary = _policy->operator()(summary, update); - } - - private: - std::shared_ptr _policy; -}; - -/* A degenerate policy used to enable Jaccard Similarity on tuple sketches, - * where the computation requires a union and intersection over the keys but - * does not need to observe the summaries. - */ -struct dummy_jaccard_policy { - void operator()(py::object&, const py::object&) const { - return; - } -}; - -} - -#endif // _TUPLE_POLICY_HPP_ \ No newline at end of file diff --git a/python/jupyter/CPCSketch.ipynb b/python/jupyter/CPCSketch.ipynb deleted file mode 100644 index 0b7caffc..00000000 --- a/python/jupyter/CPCSketch.ipynb +++ /dev/null @@ -1,345 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## CPC Sketch Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Basic Sketch Usage" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from datasketches import cpc_sketch, cpc_union" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll create a sketch with log2(k) = 12" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "sk = cpc_sketch(12)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Insert ~2 million points. Values are hashed, so using sequential integers is fine for demonstration purposes." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### CPC sketch summary:\n", - " lgK : 12\n", - " seed hash : 93cc\n", - " C : 38212\n", - " flavor : 4\n", - " merged : false\n", - " compressed : false\n", - " intresting col : 5\n", - " HIP estimate : 2.09721e+06\n", - " kxp : 11.4725\n", - " offset : 6\n", - " table : allocated\n", - " num SV : 135\n", - " window : allocated\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "n = 1 << 21\n", - "for i in range(0, n):\n", - " sk.update(i)\n", - "print(sk)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we know the exact value of n we can look at the estimate and upper/lower bounds as a % of the true value. We'll look at the bounds at 1 standard deviation. In this case, the true value does lie within the bounds, but since these are probabilistic bounds the true value will sometimes be outside them (especially at 1 standard deviation)." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Upper bound (1 std. dev) as % of true value: 100.9281\n" - ] - } - ], - "source": [ - "print(\"Upper bound (1 std. dev) as % of true value: \", round(100*sk.get_upper_bound(1) / n, 4))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Estimate as % of true value: 100.0026\n" - ] - } - ], - "source": [ - "print(\"Estimate as % of true value: \", round(100*sk.get_estimate() / n, 4))" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Lower bound (1 std. dev) as % of true value: 99.0935\n" - ] - } - ], - "source": [ - "print(\"Lower bound (1 std. dev) as % of true value: \", round(100*sk.get_lower_bound(1) / n, 4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we can serialize and deserialize the sketch, which will give us back the same structure." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2484" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sk_bytes = sk.serialize()\n", - "len(sk_bytes)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### CPC sketch summary:\n", - " lgK : 12\n", - " seed hash : 93cc\n", - " C : 38212\n", - " flavor : 4\n", - " merged : false\n", - " compressed : false\n", - " intresting col : 5\n", - " HIP estimate : 2.09721e+06\n", - " kxp : 11.4725\n", - " offset : 6\n", - " table : allocated\n", - " num SV : 135\n", - " window : allocated\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "sk2 = cpc_sketch.deserialize(sk_bytes)\n", - "print(sk2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sketch Union Usage" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we'll create two sketches with partial overlap in values. For good measure, we'll let k be larger in one sketch. For most applications we'd generally create all new data using the same size sketch, allowing differences to creep in when combining new and historica data." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "k = 12\n", - "n = 1 << 20\n", - "offset = int(3 * n / 4)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "sk1 = cpc_sketch(k)\n", - "sk2 = cpc_sketch(k + 1)\n", - "for i in range(0, n):\n", - " sk1.update(i)\n", - " sk2.update(i + offset)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a union object and add the sketches to that. To demonstrate smoothly handling multiple sketch sizes, we'll use a size of k+1 here." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "union = cpc_union(k+1)\n", - "union.update(sk1)\n", - "union.update(sk2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note how log config k has automatically adopted the value of the smaller input sketch." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### CPC sketch summary:\n", - " lgK : 12\n", - " seed hash : 93cc\n", - " C : 37418\n", - " flavor : 4\n", - " merged : true\n", - " compressed : false\n", - " intresting col : 5\n", - " HIP estimate : 0\n", - " kxp : 4096\n", - " offset : 6\n", - " table : allocated\n", - " num SV : 123\n", - " window : allocated\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "result = union.get_result()\n", - "print(result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can again compare against the exact result, in this case 1.75*n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Estimate as % of true value: 99.6646\n" - ] - } - ], - "source": [ - "print(\"Estimate as % of true value: \", round(100*result.get_estimate() / (7*n/4), 4))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/jupyter/FrequentItemsSketch.ipynb b/python/jupyter/FrequentItemsSketch.ipynb deleted file mode 100644 index bc56ed73..00000000 --- a/python/jupyter/FrequentItemsSketch.ipynb +++ /dev/null @@ -1,354 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Frequent Items Sketch Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Basic Sketch Usage" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "More so than other sketches in the library, the Frequent Items sketch can take some practice to use since it identifies exceptionally heavy hitters rather than returning a \"top N\" list. We assume readers have already familiarized themselves with the [sketch documentation](https://datasketches.github.io/docs/Frequency/FrequentItemsOverview.html) and are aware of the key concepts around use of this sketch." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from datasketches import frequent_strings_sketch, frequent_items_error_type" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll use a very small sketch in this case so that we can easily fill it, otherwise the difference between error types is more difficult to demonstrate." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "k = 3\n", - "fi = frequent_strings_sketch(k)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A brief digression into implementation details to help explain what we're doing here. The Frequent Items sketch maintains a list of items, but purges the least frequent items when the list fills. For this example, we'll keep inserting items until after a purge takes place.\n", - "\n", - "We'll insert items with exponentially decreasing weights, which in this case gives us a more interesting set of results when we later query things." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Update 1: 1 items\n", - "Update 2: 2 items\n", - "Update 3: 3 items\n", - "Update 4: 4 items\n", - "Update 5: 5 items\n", - "Update 6: 6 items\n", - "Update 7: 3 items\n", - "Update 8: 4 items\n" - ] - } - ], - "source": [ - "n = 8\n", - "for i in range(0,n):\n", - " fi.update(str(i), 2 ** (n-i))\n", - " i += 1\n", - " print('Update ' + str(i) + ': ' + str(fi.get_num_active_items()) + ' items')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see where the purge happened, and in this case we inserted a low-weight item after the purge. We can now compare querying items to exclude either false positives or false negatives.\n", - " - `NO_FALSE_POSITIVES` returns all items with a _lower_ bound above the a posteriori error\n", - " - `NO_FALSE_NEGATIVES` returns all items with an _upper_ bound above the a posteriori error\n", - "\n", - "The latter option will always include any results from the first set and may include others. Items are returned as (id, estimate, lower_bound, upper_bound) and are sorted by decreasing weight." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('0', 256, 224, 256), ('1', 128, 96, 128)]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('0', 256, 224, 256),\n", - " ('1', 128, 96, 128),\n", - " ('2', 64, 32, 64),\n", - " ('7', 34, 2, 34)]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The sketch also allows us to query for individual items directly." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "256\n", - "64\n", - "2\n" - ] - } - ], - "source": [ - "print(fi.get_estimate(\"0\"))\n", - "print(fi.get_upper_bound(\"2\"))\n", - "print(fi.get_lower_bound(\"7\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also query for items not in the the list, whether the item has never been seen or if it has been evicted from the active set." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fi.get_estimate(\"5\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The sketch may also be serialized for archiving, and reconstructed." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "84" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sk_bytes = fi.serialize()\n", - "len(sk_bytes)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Frequent items sketch summary:\n", - " lg cur map size : 3\n", - " lg max map size : 3\n", - " num active items : 4\n", - " total weight : 510\n", - " max error : 32\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "fi2 = frequent_strings_sketch.deserialize(sk_bytes)\n", - "print(fi2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merging Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Frequent Items sketches support `merge()` to combine sketches. Keep in mind that the combined sketches may not have any meaningfully frequent items, even if there were frequent items in one of the input sketches.\n", - "\n", - "We'll start by creating a sketch with lots of equally-weighted very light items, but with a combined weight several times greater than that of the first sketch, and then merge that into the first sketch." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "fi2 = frequent_strings_sketch(k)\n", - "wt = fi.get_total_weight()\n", - "for i in range(0,4*wt):\n", - " fi2.update(str(i))\n", - "fi.merge(fi2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even though all these new items have weight 1, there are so many of them that we have nothing if we ask for no fasle positives." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We do, however, see a few potentially heavy items if we request no false negatives." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/jupyter/HLLSketch.ipynb b/python/jupyter/HLLSketch.ipynb deleted file mode 100644 index 93332ce8..00000000 --- a/python/jupyter/HLLSketch.ipynb +++ /dev/null @@ -1,346 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## HLL Sketch Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Basic Sketch Usage" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from datasketches import hll_sketch, hll_union, tgt_hll_type" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll create a sketch with log2(k) = 12" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "sk = hll_sketch(12)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Insert ~2 million points. Values are hashed, so using sequential integers is fine for demonstration purposes." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### HLL SKETCH SUMMARY: \n", - " Log Config K : 12\n", - " Hll Target : HLL_4\n", - " Current Mode : HLL\n", - " LB : 2.06958e+06\n", - " Estimate : 2.09635e+06\n", - " UB : 2.12379e+06\n", - " OutOfOrder flag: 0\n", - " CurMin : 7\n", - " NumAtCurMin : 72\n", - " HipAccum : 2.09635e+06\n", - " KxQ0 : 5.80703\n", - " KxQ1 : 0\n", - "\n" - ] - } - ], - "source": [ - "n = 1 << 21\n", - "for i in range(0, n):\n", - " sk.update(i)\n", - "print(sk)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we know the exact value of n we can look at the estimate and upper/lower bounds as a % of the true value. We'll look at the bounds at 1 standard deviation. In this case, the true value does lie within the bounds, but since these are probabilistic bounds the true value will sometimes be outside them (especially at 1 standard deviation)." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Upper bound (1 std. dev) as % of true value: 101.2703\n" - ] - } - ], - "source": [ - "print(\"Upper bound (1 std. dev) as % of true value: \", round(100*sk.get_upper_bound(1) / n, 4))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Estimate as % of true value: 99.9618\n" - ] - } - ], - "source": [ - "print(\"Estimate as % of true value: \", round(100*sk.get_estimate() / n, 4))" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Lower bound (1 std. dev) as % of true value: 98.6852\n" - ] - } - ], - "source": [ - "print(\"Lower bound (1 std. dev) as % of true value: \", round(100*sk.get_lower_bound(1) / n, 4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we can serialize and deserialize the sketch, which will give us back the same structure." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2096" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sk_bytes = sk.serialize_compact()\n", - "len(sk_bytes)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### HLL SKETCH SUMMARY: \n", - " Log Config K : 12\n", - " Hll Target : HLL_4\n", - " Current Mode : HLL\n", - " LB : 2.06958e+06\n", - " Estimate : 2.09635e+06\n", - " UB : 2.12379e+06\n", - " OutOfOrder flag: 0\n", - " CurMin : 7\n", - " NumAtCurMin : 72\n", - " HipAccum : 2.09635e+06\n", - " KxQ0 : 5.80703\n", - " KxQ1 : 0\n", - "\n" - ] - } - ], - "source": [ - "sk2 = hll_sketch.deserialize(sk_bytes)\n", - "print(sk2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sketch Union Usage" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we'll create two sketches with partial overlap in values. For good measure, we'll let k be larger in one sketch. For most applications we'd generally create all new data using the same size sketch, allowing differences to creep in when combining new and historica data." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "k = 12\n", - "n = 1 << 20\n", - "offset = int(3 * n / 4)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "sk1 = hll_sketch(k)\n", - "sk2 = hll_sketch(k + 1)\n", - "for i in range(0, n):\n", - " sk1.update(i)\n", - " sk2.update(i + offset)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a union object and add the sketches to that. To demonstrate smoothly handling multiple sketch sizes, we'll use a size of k+1 here." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "union = hll_union(k+1)\n", - "union.update(sk1)\n", - "union.update(sk2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note how log config k has automatically adopted the value of the smaller input sketch." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### HLL SKETCH SUMMARY: \n", - " Log Config K : 12\n", - " Hll Target : HLL_4\n", - " Current Mode : HLL\n", - " LB : 1.80197e+06\n", - " Estimate : 1.83108e+06\n", - " UB : 1.86121e+06\n", - " OutOfOrder flag: 1\n", - " CurMin : 6\n", - " NumAtCurMin : 2\n", - " HipAccum : 1.76932e+06\n", - " KxQ0 : 6.60752\n", - " KxQ1 : 0\n", - "\n" - ] - } - ], - "source": [ - "result = union.get_result()\n", - "print(result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can again compare against the exact result, in this case 1.75*n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Estimate as % of true value: 99.7859\n" - ] - } - ], - "source": [ - "print(\"Estimate as % of true value: \", round(100*result.get_estimate() / (7*n/4), 4))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/jupyter/KLLSketch.ipynb b/python/jupyter/KLLSketch.ipynb deleted file mode 100644 index 64e43686..00000000 --- a/python/jupyter/KLLSketch.ipynb +++ /dev/null @@ -1,463 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## KLL Sketch Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Basic Sketch Usage" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from datasketches import kll_floats_sketch, kll_ints_sketch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Options are a `kll_floats_sketch` or `kll_ints_sketch`. We'll use the former so we can draw samples from a Gaussian distribution. We start by creating a sketch with $k=200$, which gives a normalized rank error of about 1.65%, and feeding in 1 million points." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "n = 1000000\n", - "kll = kll_floats_sketch(200)\n", - "from numpy.random import randn\n", - "for i in range(0, n):\n", - " kll.update(randn()) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since the data is distributed as $\\cal{N}(0,1)$, 0.0 should be near the median rank (0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.497608" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kll.get_rank(0.0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And the median should also be near 0.0" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.003108405973762274" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kll.get_quantile(0.5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We track the min and max values as well. They are stored separately from the quantile data so we can always determine the full _empirical_ data range. In this case they should be very roughly symmetric around 0.0. We can query these values explicitly, or implicitly by asking for the values at ranks 0.0 and 1.0." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[-4.6000142097473145, 4.779754638671875]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "[kll.get_min_value(), kll.get_max_value()]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[-4.6000142097473145, 4.779754638671875]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kll.get_quantiles([0.0, 1.0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And out of curiosity, we can check how many items the sketch has seen and how many it is retaining" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1000000" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kll.get_n()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "614" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kll.get_num_retained()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we can serialize the sketch for archiving, and reconstruct it later. Note that the serialized image does _not_ contain information on whether it is a floats or ints sketch." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2536" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sk_bytes = kll.serialize()\n", - "len(sk_bytes)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### KLL sketch summary:\n", - " K : 200\n", - " min K : 200\n", - " M : 8\n", - " N : 1000000\n", - " Epsilon : 1.33%\n", - " Epsilon PMF : 1.65%\n", - " Empty : false\n", - " Estimation mode: true\n", - " Levels : 13\n", - " Sorted : true\n", - " Capacity items : 617\n", - " Retained items : 614\n", - " Storage bytes : 2536\n", - " Min value : -4.6\n", - " Max value : 4.78\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "kll2 = kll_floats_sketch.deserialize(sk_bytes)\n", - "print(kll2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merging Sketches" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "KLL sketches have a `merge()` operation to combine sketches. The resulting sketch will have no worse error boudns than if the full data had been sent to a single sketch." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our previous sketch used $\\cal{N}(0,1)$, so now we'll generate a shifted Gaussian distributed as $\\cal{N}(4,1)$. For added variety, we can use half as many points. The next section will generate a plot, so we will defer queries of the merged skech to that section." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "sk2 = kll_floats_sketch(200)\n", - "for i in range(0, int(n/2)):\n", - " sk2.update(4 + randn())" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### KLL sketch summary:\n", - " K : 200\n", - " min K : 200\n", - " M : 8\n", - " N : 1500000\n", - " Epsilon : 1.33%\n", - " Epsilon PMF : 1.65%\n", - " Empty : false\n", - " Estimation mode: true\n", - " Levels : 13\n", - " Sorted : false\n", - " Capacity items : 617\n", - " Retained items : 580\n", - " Storage bytes : 2400\n", - " Min value : -4.6\n", - " Max value : 9.06\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "kll.merge(sk2)\n", - "print(kll)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generating Histograms" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The KLL sketch allows us compute histograms via the probability mass function (pmf). Since histograms are a typical plot type when visualizing data distributions, we will create such a figure. To instead create a cumulative distribution function (cdf) from the sketch, simply replace the call to `get_pmf()` with `get_cdf()`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want our x-axis to have evenly distributed bins, so the first step is to split the empirical data range\n", - "into a set of bins." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "xmin = kll.get_min_value()\n", - "num_splits = 30\n", - "step = (kll.get_max_value() - xmin) / num_splits\n", - "splits = [xmin + (i*step) for i in range(0, num_splits)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`get_pmf()` returns the probability mass in the range $(x_{i-1}, x_i]$, for each bin $i$. If we use the minimum value for $x_{i-1}$ this covers the low end, but `get_pmf()` also returns an extra bin with all mass greater than the last-provided split point. As a result, the pmf array is 1 larger than the list of split points. We need to be sure to append a value to the split points for plotting." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "pmf = kll.get_pmf(splits)\n", - "x = splits # this will hold the x-axis values, so need to append the max value\n", - "x.append(kll.get_max_value())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need some plotting-related imports and options" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "sns.set(color_codes=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using a negative width in the plot gives right-aligned bins, which matches the bin definition noted earlier." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD7CAYAAABpJS8eAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAUgklEQVR4nO3dfYxc11nH8e+u7V3b8Tpt3aliJyQISh8UBHFoCCp2oCgG1IjUlCSNFKM0ahs3qigvSkVVxYHwkgoaBVcWKVQJqgPutrSuqJvWSSs3hhSqNKWKE4mkj2glAjhGmA0Q240d22v+mLvRdLu7c3d3dmdmz/cjWZp7zrnrZ95+e/fMvWcGzp07hyRp6RvsdgGSpMVh4EtSIQx8SSqEgS9JhTDwJakQy7tdwDSGgZ8CjgBnu1yLJPWLZcB64BvAqcmdvRr4PwV8tdtFSFKfugr4h8mNvRr4RwD+539OMD7eW9cJrFu3hrGx490uoxZrXRj9VCv0V73WOj+DgwO8+tXnQZWhk9UK/Ii4CdgBDAE7M/O+acY9CBzMzN3V9ibgI8AKYAx4Z2Y+V+O/PAswPn6u5wIf6MmapmOtC6OfaoX+qtdaO2LKqfC2H9pGxIXA3cBm4DJge0RcOmnMhoh4CLhh0u6fAN6VmRur27vmULgkqQPqnKWzBXg0M1/IzBPAXuD6SWO2AfuAT080RMQwsCMzn66angYunn/JkqS5qDOls4HvnQ86AlzZOiAz7wGIiM0tbaeAPVX7IHAX8Ln5lStJmqs6gT8wRdt43f8gIoaAB6v/60N194PmhyK9qNEY6XYJtVnrwuinWqG/6rXWhVMn8A/TPMVnwnrg+To/PCLWAJ+n+YHt1sw8PZvixsaO99yHIo3GCEePHut2GbVY68Lop1qhv+q11vkZHByY8UC5TuAfAO6KiAZwArgO2F7z/98DfBt4T2b2VnJLUmHafmibmYeBO4CDwCFgNDOfiIj9EXHFdPtFxOXAVmAT8GREHIqI/R2qW5I0S7XOw8/MUWB0Uts1U4y7peX2k0w9/6+CjKxdxcrhmV9mJ0+dWaRqpLL16pW2WiJWDi/n2tv3zTjmoXu3LlI1UtlcLVOSCuERvnrCy6fPtj3F7eSpMxx78aVFqkhaegx89YShFctqTf301klwUn9xSkeSCmHgS1IhDHxJKoSBL0mFMPAlqRAGviQVwsCXpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhTDwJakQBr4kFcLAl6RCGPiSVAgDX5IKYeBLUiEMfEkqhIEvSYUw8CWpEAa+JBVieZ1BEXETsAMYAnZm5n3TjHsQOJiZu6vti4E9wOuABLZl5vEO1K1CjaxdxcrhmV+2J0+d4diLLy1SRVL/aBv4EXEhcDfwRuAU8LWIOJiZz7SM2QB8DLgaONiy+0eBj2bmpyLiTuBO4AMdrF+FWTm8nGtv3zfjmIfu3cqxRapH6id1pnS2AI9m5guZeQLYC1w/acw2YB/w6YmGiFgB/Gw1HmA3cMN8C5YkzU2dKZ0NwJGW7SPAla0DMvMegIjY3NL8WuDFzDzTst9Fcy9VkjQfdQJ/YIq28QXc7xXr1q2ZzfBF02iMdLuE2vqp1k5a6Pvdb49rP9VrrQunTuAfBq5q2V4PPF9jv6PA2ohYlplnZ7HfK8bGjjM+fm42uyy4RmOEo0f7Y4a4F2rt1htiIe93Lzyus9FP9Vrr/AwODsx4oFxnDv8AcHVENCJiNXAd8Ei7nTLzNPBV4Maq6Wbg4Rr/nyRpAbQN/Mw8DNxB8+ybQ8BoZj4REfsj4oo2u78X2B4Rz9D8K2HHfAuWJM1NrfPwM3MUGJ3Uds0U426ZtP0c8Oa5lydJ6hSvtJWkQhj4klQIA1+SCmHgS1IhDHxJKoSBL0mFMPAlqRAGviQVwsCXpEIY+JJUCANfkgph4EtSIWotniZNVvfLxCX1DgNfc1L3y8Ql9Q6ndCSpEAa+JBXCwJekQhj4klQIA1+SCmHgS1IhDHxJKoSBL0mFMPAlqRAGviQVwsCXpEIY+JJUCANfkgpRa7XMiLgJ2AEMATsz875J/RuB+4HzgceA2zLzTET8IPBXwFrgf4F3ZOZznStfklRX2yP8iLgQuBvYDFwGbI+ISycN2wO8LzPfAAwAt1btfwh8MjM3Ap+tfo4kqQvqTOlsAR7NzBcy8wSwF7h+ojMiLgFWZebjVdNu4Ibq9jKaR/cA5wEvdaJoSdLs1ZnS2QAcadk+AlzZpv+i6vadwNci4jdoTge9aTbFrVu3ZjbDF02jMdLtEmrrp1o7aaHvd789rv1Ur7UunDqBPzBF23jN/geB7Zm5LyKuA/42In4iM8/VKW5s7Djj47WGLppGY4SjR491u4xaFrLWXn+hL+Rz1E+vAeiveq11fgYHB2Y8UK4zpXMYuKBlez3wfLv+iGgAP5qZ+wAy87PVuNfWK12S1El1Av8AcHVENCJiNXAd8MhEZ3XWzcmI2FQ13Qw8DPx31b4ZoOo/lplHO3kHJEn1tA38zDwM3AEcBA4Bo5n5RETsj4grqmHbgJ0R8SzND2d3VdM2vwrcGxFPAx+m+ctCktQFtc7Dz8xRYHRS2zUtt5/iez/InWh/AvjpedYoSeoAr7SVpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhTDwJakQBr4kFcLAl6RCGPiSVAgDX5IKYeBLUiEMfEkqhIEvSYUw8CWpEAa+JBXCwJekQtT6xiupn7x8+iyNxkjbcSdPneHYiy8tQkVSbzDwteQMrVjGtbfvazvuoXu3cmwR6pF6hVM6klQIA1+SCmHgS1IhDHxJKoSBL0mFMPAlqRAGviQVotZ5+BFxE7ADGAJ2ZuZ9k/o3AvcD5wOPAbdl5pmIWA88AGwAvgtsy8x/7Vz5kqS62h7hR8SFwN3AZuAyYHtEXDpp2B7gfZn5BmAAuLVq/2vgocy8vLr9J50qXJI0O3WmdLYAj2bmC5l5AtgLXD/RGRGXAKsy8/GqaTdwQ0S8luYviI9V7R+n+VeCJKkL6kzpbACOtGwfAa5s038R8MPAvwE7I+Lnq9u/Ppvi1q1bM5vhi6bOOi29Yra1vnz6LEMrli1QNb1nrs9lr7wG6j5f579qdd88r73y2NbRT7VCvcAfmKJtvEb/cuBy4Pcy87ci4t3Ag8Cb6xY3Nnac8fFzdYcvikZjhKNH+2MFlrnU2miM1F6HZimYy3PZS6+B2TxfvVLzTHrpsW2nF2sdHByY8UC5zpTOYeCClu31wPM1+v8TOJaZX6jaR/nevwwkSYuoTuAfAK6OiEZErAauAx6Z6MzM54CTEbGparoZeDgzvwMcjoi3VO3XAt/sXOmS6ppYMrrdv5G1q7pdqhZQ2ymdzDwcEXcAB2melvlAZj4REfuB383MfwK2AfdHxAjwJLCr2v1twMci4h7gReAdC3EnJM3MJaMFNc/Dz8xRmlMyrW3XtNx+iimmazIzmcWcvSRp4XilrSQVwsCXpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhTDwJakQBr4kFcLAl6RCGPiSVAgDX5IKYeBLUiEMfEkqhIEvSYUw8CWpELW+AEXS4htZu4qVw75F1Tm+mqQetXJ4eduvJXzo3q2LVI2WAqd0JKkQBr4kFcLAl6RCOIcv6RUvnz5LozEy45iTp85w7MWXFqkidZKBL+kVQyuW1fqg+Ngi1aPOckpHkgph4EtSIQx8SSpErTn8iLgJ2AEMATsz875J/RuB+4HzgceA2zLzTEv/5cDjmTncqcIlSbPT9gg/Ii4E7gY2A5cB2yPi0knD9gDvy8w3AAPArS37rwb+jOYvC0lSl9SZ0tkCPJqZL2TmCWAvcP1EZ0RcAqzKzMerpt3ADS373wvs7Ey5kqS5qhP4G4AjLdtHgIvq9EfEW4HVmbl3nnVKkuapzhz+wBRt4+36I+ICmvP+W+ZSGMC6dWvmuuuCandhSi/pp1q7Ya6PT+mP60Le/356bPupVqgX+IeBq1q21wPPT+q/YIr+XwbWAY9FBAARcQi4KjNrXbcxNnac8fFzdYYumkZjhKNH++Oyk7nU2m8v4Pmay3O5WK+BXn4uFur+L/X310IbHByY8UC5TuAfAO6KiAZwArgO2D7RmZnPRcTJiNiUmf8I3Aw8nJkPAA9MjIuIc5m5cY73Q5I0T23n8DPzMHAHcBA4BIxm5hMRsT8irqiGbQN2RsSzwHnAroUqWJI0N7XOw8/MUWB0Uts1LbefAq5s8zOmmuuXJC0Sr7SVpEIY+JJUCANfkgrhevgqVp0v+4DOf+HHyNpVrBye+a138tSZGfuluTDwVaw6X/YBnf/Cj5XDy2t9yYjUaQZ+IeocVUpa2kyAQtQ5qgSPLKWlzA9tJakQBr4kFcLAl6RCGPiSVAgDX5IK4Vk6kmat7sVjnbxgTfNn4EuatboXj/XW14PIKR1JKoSBL0mFMPAlqRDO4UttTLWq5uTtidUtXQVTvczAl9qos6rmxBpEroKpXuaUjiQVwsCXpEIY+JJUCANfkgph4EtSIQx8SSqEgS9Jhah1Hn5E3ATsAIaAnZl536T+jcD9wPnAY8BtmXkmIjYBHwFWAGPAOzPzuQ7WL0mqqe0RfkRcCNwNbAYuA7ZHxKWThu0B3peZbwAGgFur9k8A78rMjdXtXZ0qXJI0O3WmdLYAj2bmC5l5AtgLXD/RGRGXAKsy8/GqaTdwQ0QMAzsy8+mq/Wng4o5VLkmalTpTOhuAIy3bR4Ar2/RflJmnaB75ExGDwF3A5+ZTrCRp7uoE/sAUbeN1+yNiCHiw+r8+NJvi1q1bM5vhi2bywlm9rJ9q1dIzl9dfP71m+6lWqBf4h4GrWrbXA89P6r9gqv6IWAN8nuYHtlsz8/RsihsbO874+LnZ7LLgGo0Rjh7tj+/xaa21316YWhpm+17p1/dXrxgcHJjxQLnOHP4B4OqIaETEauA64JGJzuqsm5PVGTkANwMPV7f3AN8G3l5N8UiSuqRt4GfmYeAO4CBwCBjNzCciYn9EXFEN2wbsjIhngfOAXRFxObAV2AQ8GRGHImL/gtwLSVJbtc7Dz8xRYHRS2zUtt5/iez/IBXiSqef3JUld4JW2klQIA1+SCmHgS1IhDHxJKoRfYi5pwYysXcXK4Zlj5uSpMxx78aVFqqhsBr6kBbNyeDnX3r5vxjEP3buV3rp8aeky8JeAmY6ivMJW0gQDfwmoexQlqWx+aCtJhTDwJakQBr4kFcLAl6RCGPiSVAgDX5IKYeBLUiEMfEkqhBdeSeq61qvFp7s63DV35s/Al9R1rrmzOJzSkaRCGPiSVAgDX5IKYeBLUiEMfEkqhGfp9LC6Xw8nSXUY+D3MLzaR1ElO6UhSIQx8SSpErSmdiLgJ2AEMATsz875J/RuB+4HzgceA2zLzTERcDOwBXgcksC0zj3ewfklSTW0DPyIuBO4G3gicAr4WEQcz85mWYXuAd2fm4xHxl8CtwJ8DHwU+mpmfiog7gTuBD3T6TvQbP4yVZu/l02enXWen1amXzzI8tKztuBLX5qlzhL8FeDQzXwCIiL3A9cAfVNuXAKsy8/Fq/G7g9yPiAeBngV9paf976gX+MoDBwYFad2KxzbeulcPLedcffXnGMX+54xcBeN2rV9X6mXXGdfJn9fq4Xq6t0+N6ubZOjhtasazt+waa7526407M873caxnVUs+Uv/EGzp07N+MPiIgPAudl5o5q+93AlZm5vdp+E3BPZm6utl8P7Ad+DvhGZl5UtS8HvpuZQzXq3gx8tcY4SdL3uwr4h8mNdY7wp/oVNl6jv91+M/kGzYKPAGdr7iNJpVsGrKeZod+nTuAfphm+E9YDz0/qv2CK/qPA2ohYlplnp9hvJqeY4reTJKmt70zXUee0zAPA1RHRiIjVwHXAIxOdmfkccDIiNlVNNwMPZ+ZpmtMyN7a2z6F4SVIHtA38zDwM3AEcBA4Bo5n5RETsj4grqmHbgJ0R8SxwHrCran8vsD0inqH5V8KOTt8BSVI9bT+0lSQtDV5pK0mFMPAlqRAGviQVwsCXpEK4Hv4cRcTlwOOZOdztWqZTnSr7EWAFMAa8szqNtqe0W5yvl0TE7wFvrza/mJm/08166oiIe4BGZt7S7VqmExHXAnfRPMvvS5n5m92taHoR8WvAB6vNhzPz/d2sZzY8wp+D6nqEP6MZUL3sE8C7MnNjdXtXm/GLrmVxvs3AZTRP4720u1VNLSK2AL8IXA5sBN4YEW/rblUzi4irgVu6XcdMIuKHgL8AtgI/DvxkRLylu1VNrXrv76K5dMxlwFXV66IvGPhzcy+ws9tFzCQihoEdmfl01fQ0cHEXS5rOK4vzZeYJYGJxvl50BLg9M1+uLix8lt58TAGIiNfQ/GX6oW7X0sbbgL/JzP+oHtcbga93uabpLKOZm+fR/Mt5BdA3S246pTNLEfFWYHVm7o2Ibpczrcw8RXPZaiJikOafy5/rZk3T2EAzSCccAa7sUi0zysx/nrgdET9CM5h+pnsVtfUxmhdN/kC3C2nj9cDLEfElmsu0PERzKfWek5nHqqXev0Uz6P8O+FpXi5oFA38aEXED338U/y1gLc2j0p4xXa2ZuSUihoAHaT7XvXikN59F9roiIn4M+CLw/sz8l27XM5VqVdt/z8yvRMQt3a6njeU0l1J/M3Ac2Ae8g+aS6j0lIn4CeCdwCfB/NA+q3g/c08266jLwp5GZnwE+09pWvYk+CDw2cXQfEYeAqzLz2KIXWZmqVoCIWAN8nuYHtlurP5d7TbvF+XpK9UH4Z4HfysxPdbueGdwIrK9en68B1kTEzsz87S7XNZX/BA5k5lGAiPgczb/ydnezqGn8EvCVzPwvgIjYTXMJGQN/qcnMB4AHJrYj4lz1gWiv2gN8G3hPZvbqGhoHgLsiogGcoLk43/buljS1iPgBmtNiN2bmo92uZyaZ+QsTt6sj/Df3aNgDfAF4MCJeBRwD3kJvTj8CPAV8OCLOA74LXMs0SxH3Ij+0XaKq00a3ApuAJyPiUETs73JZ32e6xfm6W9W03g+sBP60ejwPRcRt3S6q32Xm14EP01wS/RngOeDjXS1qGpn5ZeCTwDdpngixAvjjrhY1Cy6eJkmF8Ahfkgph4EtSIQx8SSqEgS9JhTDwJakQBr4kFcLAl6RCGPiSVIj/B2kl2CPiXnJXAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.bar(x=x,height=pmf,align='edge',width=-0.43)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The leftmost peak came from the first sketch, with data centered around 0.0. The smaller, rightmost peak came from our second sketch, which had half as many samples and was centered around 4.0. The KLL sketch captures the shape of the combiend distribution." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/jupyter/ThetaSketchNotebook.ipynb b/python/jupyter/ThetaSketchNotebook.ipynb deleted file mode 100644 index 7a3e3209..00000000 --- a/python/jupyter/ThetaSketchNotebook.ipynb +++ /dev/null @@ -1,403 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Theta Sketch Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Basic Sketch Usage" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from datasketches import theta_sketch, update_theta_sketch, compact_theta_sketch\n", - "from datasketches import theta_union, theta_intersection, theta_a_not_b" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To start, we'll create a sketch with 1 million points in order to demonstrate basic sketch operations." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Theta sketch summary:\n", - " num retained entries : 6560\n", - " seed hash : 37836\n", - " empty? : false\n", - " ordered? : false\n", - " estimation mode? : true\n", - " theta (fraction) : 0.00654224\n", - " theta (raw 64-bit) : 60341508738660257\n", - " estimate : 1.00271e+06\n", - " lower bound 95% conf : 978261\n", - " upper bound 95% conf : 1.02778e+06\n", - " lg nominal size : 12\n", - " lg current size : 13\n", - " resize factor : 8\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "n = 1000000\n", - "k = 12\n", - "sk1 = update_theta_sketch(k)\n", - "for i in range(0, n):\n", - " sk1.update(i)\n", - "print(sk1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The summary contains most data fo interest, but we can also query for specific information. And in this case, since we know the exact number of distinct items presented ot the sketch, we can look at the estimate, upper, and lower bounds as a percentage of the exact value." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Upper bound (1 std. dev) as % of true value:\t 101.5208\n", - "Sketch estimate as % of true value:\t\t 100.2715\n", - "Lower bound (1 std. dev) as % of true value:\t 99.0374\n" - ] - } - ], - "source": [ - "print(\"Upper bound (1 std. dev) as % of true value:\\t\", round(100*sk1.get_upper_bound(1) / n, 4))\n", - "print(\"Sketch estimate as % of true value:\\t\\t\", round(100*sk1.get_estimate() / n, 4))\n", - "print(\"Lower bound (1 std. dev) as % of true value:\\t\", round(100*sk1.get_lower_bound(1) / n, 4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can serialize and reconstruct the sketch. Serialization necessarily produces a compact sketch, meaning the sketch can be deserialized and queried or used for further unions or set operations but can not be updated directly." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "52504" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sk1_bytes = sk1.compact().serialize()\n", - "len(sk1_bytes)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Estimate: \t\t 1002714.745231455\n", - "Estimation mode: \t True\n" - ] - } - ], - "source": [ - "new_sk1 = compact_theta_sketch.deserialize(sk1_bytes)\n", - "print(\"Estimate: \\t\\t\", new_sk1.get_estimate())\n", - "print(\"Estimation mode: \\t\", new_sk1.is_estimation_mode())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sketch Unions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Theta Sketch unions make use of a separate union object. The union will accept input sketches with different values of $k$.\n", - "\n", - "For this example, we will create a sketch with distinct values that partially overlap those in `sk1`." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Theta sketch summary:\n", - " num retained entries : 12488\n", - " seed hash : 37836\n", - " empty? : false\n", - " ordered? : false\n", - " estimation mode? : true\n", - " theta (fraction) : 0.0123336\n", - " theta (raw 64-bit) : 113757656857900725\n", - " estimate : 1.01252e+06\n", - " lower bound 95% conf : 994626\n", - " upper bound 95% conf : 1.03073e+06\n", - " lg nominal size : 13\n", - " lg current size : 14\n", - " resize factor : 8\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "offset = int(3 * n / 4)\n", - "sk2 = update_theta_sketch(k+1)\n", - "for i in range(0, n):\n", - " sk2.update(i + offset)\n", - "print(sk2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now feed the sketches into the union. As constructed, the exact number of unique values presented to the two sketches is $\\frac{7}{4}n$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Union estimate as % of true value: 99.6787\n" - ] - } - ], - "source": [ - "union = theta_union(k)\n", - "union.update(sk1)\n", - "union.update(sk2)\n", - "result = union.get_result()\n", - "print(\"Union estimate as % of true value: \", round(100*result.get_estimate()/(1.75*n), 4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sketch Intersections" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Beyond unions, theta sketches also support intersctions through the use of an intersection object. These set intersections can have vastly superior error bounds than the classic inclusion-exclusion rule used with sketches like HLL." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Has result: True\n", - "### Theta sketch summary:\n", - " num retained entries : 1668\n", - " seed hash : 37836\n", - " empty? : false\n", - " ordered? : true\n", - " estimation mode? : true\n", - " theta (fraction) : 0.00654224\n", - " theta (raw 64-bit) : 60341508738660257\n", - " estimate : 254959\n", - " lower bound 95% conf : 242739\n", - " upper bound 95% conf : 267789\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "intersection = theta_intersection()\n", - "intersection.update(sk1)\n", - "intersection.update(sk2)\n", - "print(\"Has result: \", intersection.has_result())\n", - "result = intersection.get_result()\n", - "print(result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, we expect the sets to have an overlap of $\\frac{1}{4}n$." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Intersection estimate as % of true value: 101.9834\n" - ] - } - ], - "source": [ - "print(\"Intersection estimate as % of true value: \", round(100*result.get_estimate()/(0.25*n), 4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set Subtraction (A-not-B)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we have the set subtraction operation. Unlike `theta_union` and `theta_intersection`, `theta_a_not_b` always takes as input 2 sketches at a time, namely $a$ and $b$, and directly returns the result as a sketch." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Theta sketch summary:\n", - " num retained entries : 4892\n", - " seed hash : 37836\n", - " empty? : false\n", - " ordered? : true\n", - " estimation mode? : true\n", - " theta (fraction) : 0.00654224\n", - " theta (raw 64-bit) : 60341508738660257\n", - " estimate : 747756\n", - " lower bound 95% conf : 726670\n", - " upper bound 95% conf : 769452\n", - "### End sketch summary\n", - "\n" - ] - } - ], - "source": [ - "anb = theta_a_not_b()\n", - "result = anb.compute(sk1, sk2)\n", - "print(result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By using the same two sketches as before, the expected result here is $\\frac{3}{4}n$." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A-not-B estimate as % of true value: 99.7008\n" - ] - } - ], - "source": [ - "print(\"A-not-B estimate as % of true value: \", round(100*result.get_estimate()/(0.75*n), 4))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.6 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "vscode": { - "interpreter": { - "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/pybind11Path.cmd b/python/pybind11Path.cmd deleted file mode 100644 index 354c88d3..00000000 --- a/python/pybind11Path.cmd +++ /dev/null @@ -1,21 +0,0 @@ -:: Licensed to the Apache Software Foundation (ASF) under one -:: or more contributor license agreements. See the NOTICE file -:: distributed with this work for additional information -:: regarding copyright ownership. The ASF licenses this file -:: to you under the Apache License, Version 2.0 (the -:: "License"); you may not use this file except in compliance -:: with the License. You may obtain a copy of the License at -:: -:: http://www.apache.org/licenses/LICENSE-2.0 -:: -:: Unless required by applicable law or agreed to in writing, -:: software distributed under the License is distributed on an -:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -:: KIND, either express or implied. See the License for the -:: specific language governing permissions and limitations -:: under the License. - - -@echo off -:: Takes path to the Python interpreter and returns the path to pybind11 -%1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())" \ No newline at end of file diff --git a/python/src/__init__.py b/python/src/__init__.py deleted file mode 100644 index 756e6938..00000000 --- a/python/src/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name = "datasketches" diff --git a/python/src/count_wrapper.cpp b/python/src/count_wrapper.cpp deleted file mode 100644 index bebb4696..00000000 --- a/python/src/count_wrapper.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include - -#include "count_min.hpp" -#include "common_defs.hpp" - -namespace py = pybind11; - -template -void bind_count_min_sketch(py::module &m, const char* name) { - using namespace datasketches; - - py::class_>(m, name) - .def(py::init(), py::arg("num_hashes"), py::arg("num_buckets"), py::arg("seed")=DEFAULT_SEED) - .def(py::init&>()) - .def_static("suggest_num_buckets", &count_min_sketch::suggest_num_buckets, py::arg("relative_error"), - "Suggests the number of buckets needed to achieve an accuracy within the provided " - "relative_error. For example, when relative_error = 0.05, the returned frequency estimates " - "satisfy the 'relative_error' guarantee that never overestimates the weights but may " - "underestimate the weights by 5% of the total weight in the sketch. " - "Returns the number of hash buckets at every level of the sketch required in order to obtain " - "the specified relative error.") - .def_static("suggest_num_hashes", &count_min_sketch::suggest_num_hashes, py::arg("confidence"), - "Suggests the number of hashes needed to achieve the provided confidence. For example, " - "with 95% confidence, frequency estimates satisfy the 'relative_error' guarantee. " - "Returns the number of hash functions that are required in order to achieve the specified " - "confidence of the sketch. confidence = 1 - delta, with delta denoting the sketch failure probability.") - .def("__str__", &count_min_sketch::to_string, - "Produces a string summary of the sketch") - .def("to_string", &count_min_sketch::to_string, - "Produces a string summary of the sketch") - .def("is_empty", &count_min_sketch::is_empty, - "Returns True if the sketch has seen no items, otherwise False") - .def("get_num_hashes", &count_min_sketch::get_num_hashes, - "Returns the configured number of hashes for the sketch") - .def("get_num_buckets", &count_min_sketch::get_num_buckets, - "Returns the configured number of buckets for the sketch") - .def("get_seed", &count_min_sketch::get_seed, - "Returns the base hash seed for the sketch") - .def("get_relative_error", &count_min_sketch::get_relative_error, - "Returns the maximum permissible error for any frequency estimate query") - .def("get_total_weight", &count_min_sketch::get_total_weight, - "Returns the total weight currently inserted into the stream") - .def("update", static_cast::*)(int64_t, W)>(&count_min_sketch::update), py::arg("item"), py::arg("weight")=1.0, - "Updates the sketch with the given 64-bit integer value") - .def("update", static_cast::*)(const std::string&, W)>(&count_min_sketch::update), py::arg("item"), py::arg("weight")=1.0, - "Updates the sketch with the given string") - .def("get_estimate", static_cast::*)(int64_t) const>(&count_min_sketch::get_estimate), py::arg("item"), - "Returns an estimate of the frequency of the provided 64-bit integer value") - .def("get_estimate", static_cast::*)(const std::string&) const>(&count_min_sketch::get_estimate), py::arg("item"), - "Returns an estimate of the frequency of the provided string") - .def("get_upper_bound", static_cast::*)(int64_t) const>(&count_min_sketch::get_upper_bound), py::arg("item"), - "Returns an upper bound on the estimate for the given 64-bit integer value") - .def("get_upper_bound", static_cast::*)(const std::string&) const>(&count_min_sketch::get_upper_bound), py::arg("item"), - "Returns an upper bound on the estimate for the provided string") - .def("get_lower_bound", static_cast::*)(int64_t) const>(&count_min_sketch::get_lower_bound), py::arg("item"), - "Returns an lower bound on the estimate for the given 64-bit integer value") - .def("get_lower_bound", static_cast::*)(const std::string&) const>(&count_min_sketch::get_lower_bound), py::arg("item"), - "Returns an lower bound on the estimate for the provided string") - .def("merge", &count_min_sketch::merge, py::arg("other"), - "Merges the provided other sketch into this one") - .def("get_serialized_size_bytes", &count_min_sketch::get_serialized_size_bytes, - "Returns the size in bytes of the serialized image of the sketch") - .def( - "serialize", - [](const count_min_sketch& sk) { - auto bytes = sk.serialize(); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, - "Serializes the sketch into a bytes object" - ) - .def_static( - "deserialize", - [](const std::string& bytes) { return count_min_sketch::deserialize(bytes.data(), bytes.size()); }, - py::arg("bytes"), - "Reads a bytes object and returns the corresponding count_min_sketch" - ); -} - -void init_count_min(py::module &m) { - bind_count_min_sketch(m, "count_min_sketch"); -} - diff --git a/python/src/cpc_wrapper.cpp b/python/src/cpc_wrapper.cpp deleted file mode 100644 index 7da1415b..00000000 --- a/python/src/cpc_wrapper.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include - -#include "cpc_sketch.hpp" -#include "cpc_union.hpp" -#include "cpc_common.hpp" -#include "common_defs.hpp" - -namespace py = pybind11; - -void init_cpc(py::module &m) { - using namespace datasketches; - - py::class_(m, "cpc_sketch") - .def(py::init(), py::arg("lg_k")=cpc_constants::DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED) - .def(py::init()) - .def("__str__", &cpc_sketch::to_string, - "Produces a string summary of the sketch") - .def("to_string", &cpc_sketch::to_string, - "Produces a string summary of the sketch") - .def("update", &cpc_sketch::update, py::arg("datum"), - "Updates the sketch with the given 64-bit integer value") - .def("update", &cpc_sketch::update, py::arg("datum"), - "Updates the sketch with the given 64-bit floating point") - .def("update", &cpc_sketch::update, py::arg("datum"), - "Updates the sketch with the given string") - .def("get_lg_k", &cpc_sketch::get_lg_k, - "Returns configured lg_k of this sketch") - .def("is_empty", &cpc_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("get_estimate", &cpc_sketch::get_estimate, - "Estimate of the distinct count of the input stream") - .def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"), - "Returns an approximate lower bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations") - .def("get_upper_bound", &cpc_sketch::get_upper_bound, py::arg("kappa"), - "Returns an approximate upper bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations") - .def( - "serialize", - [](const cpc_sketch& sk) { - auto bytes = sk.serialize(); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, - "Serializes the sketch into a bytes object" - ) - .def_static( - "deserialize", - [](const std::string& bytes) { return cpc_sketch::deserialize(bytes.data(), bytes.size()); }, - py::arg("bytes"), - "Reads a bytes object and returns the corresponding cpc_sketch" - ); - - py::class_(m, "cpc_union") - .def(py::init(), py::arg("lg_k"), py::arg("seed")=DEFAULT_SEED) - .def(py::init()) - .def("update", (void (cpc_union::*)(const cpc_sketch&)) &cpc_union::update, py::arg("sketch"), - "Updates the union with the provided CPC sketch") - .def("get_result", &cpc_union::get_result, - "Returns a CPC sketch with the result of the union") - ; -} diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp deleted file mode 100644 index e3604525..00000000 --- a/python/src/datasketches.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include - -namespace py = pybind11; - -// sketches -void init_hll(py::module& m); -void init_kll(py::module& m); -void init_fi(py::module& m); -void init_cpc(py::module& m); -void init_theta(py::module& m); -void init_tuple(py::module& m); -void init_vo(py::module& m); -void init_req(py::module& m); -void init_quantiles(py::module& m); -void init_count_min(py::module& m); -void init_density(py::module& m); -void init_vector_of_kll(py::module& m); - -// supporting objects -void init_kolmogorov_smirnov(py::module& m); -void init_serde(py::module& m); - -PYBIND11_MODULE(_datasketches, m) { - init_hll(m); - init_kll(m); - init_fi(m); - init_cpc(m); - init_theta(m); - init_tuple(m); - init_vo(m); - init_req(m); - init_quantiles(m); - init_count_min(m); - init_density(m); - init_vector_of_kll(m); - - init_kolmogorov_smirnov(m); - init_serde(m); -} diff --git a/python/src/density_wrapper.cpp b/python/src/density_wrapper.cpp deleted file mode 100644 index 03b7fee9..00000000 --- a/python/src/density_wrapper.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include -#include - -#include "kernel_function.hpp" -#include "density_sketch.hpp" - -namespace py = pybind11; - -template -void bind_density_sketch(py::module &m, const char* name) { - using namespace datasketches; - - py::class_>(m, name) - .def( - py::init([](uint16_t k, uint32_t dim, std::shared_ptr kernel) { - kernel_function_holder holder(kernel); - return density_sketch(k, dim, holder); - }), - py::arg("k"), py::arg("dim"), py::arg("kernel")) - .def("update", static_cast::*)(const std::vector&)>(&density_sketch::update), - "Updates the sketch with the given vector") - .def("merge", static_cast::*)(const density_sketch&)>(&density_sketch::merge), py::arg("sketch"), - "Merges the provided sketch into this one") - .def("is_empty", &density_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("get_k", &density_sketch::get_k, - "Returns the configured parameter k") - .def("get_dim", &density_sketch::get_dim, - "Returns the configured parameter dim") - .def("get_n", &density_sketch::get_n, - "Returns the length of the input stream") - .def("get_num_retained", &density_sketch::get_num_retained, - "Returns the number of retained items (samples) in the sketch") - .def("is_estimation_mode", &density_sketch::is_estimation_mode, - "Returns True if the sketch is in estimation mode, otherwise False") - .def("get_estimate", &density_sketch::get_estimate, py::arg("point"), - "Returns an approximate density at the given point") - .def("__str__", &density_sketch::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("to_string", &density_sketch::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("__iter__", [](const density_sketch& s){ return py::make_iterator(s.begin(), s.end()); }) - .def("serialize", - [](const density_sketch& sk) { - auto bytes = sk.serialize(); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, - "Serializes the sketch into a bytes object" - ) - .def_static( - "deserialize", - [](const std::string& bytes, std::shared_ptr kernel) { - kernel_function_holder holder(kernel); - return density_sketch::deserialize(bytes.data(), bytes.size(), holder); - }, - py::arg("bytes"), py::arg("kernel"), - "Reads a bytes object and returns the corresponding density_sketch" - );; -} - -void init_density(py::module &m) { - using namespace datasketches; - - // generic kernel function - py::class_>(m, "KernelFunction") - .def(py::init()) - .def("__call__", &kernel_function::operator(), py::arg("a"), py::arg("b")) - ; - - // the old sketch names can almost be defined, but the kernel_function_holder won't work in init() - //bind_density_sketch>(m, "density_floats_sketch"); - //bind_density_sketch>(m, "density_doubles_sketch"); - bind_density_sketch(m, "_density_sketch"); -} diff --git a/python/src/fi_wrapper.cpp b/python/src/fi_wrapper.cpp deleted file mode 100644 index 36fa97dc..00000000 --- a/python/src/fi_wrapper.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - -#include "py_serde.hpp" -#include "py_object_ostream.hpp" -#include "frequent_items_sketch.hpp" - -#include - -#include - -namespace py = pybind11; - -// forward declarations -// std::string and arithmetic types, where we don't need a separate serde -template::value || std::is_same::value, bool>::type = 0> -void add_serialization(py::class_>& clazz); - -// py::object and other types where the caller must provide a serde -template::value && !std::is_same::value, bool>::type = 0> -void add_serialization(py::class_>& clazz); - -template -void bind_fi_sketch(py::module &m, const char* name) { - using namespace datasketches; - - auto fi_class = py::class_>(m, name) - .def(py::init(), py::arg("lg_max_k")) - .def("__str__", &frequent_items_sketch::to_string, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("to_string", &frequent_items_sketch::to_string, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("update", (void (frequent_items_sketch::*)(const T&, uint64_t)) &frequent_items_sketch::update, py::arg("item"), py::arg("weight")=1, - "Updates the sketch with the given string and, optionally, a weight") - .def("merge", (void (frequent_items_sketch::*)(const frequent_items_sketch&)) &frequent_items_sketch::merge, - "Merges the given sketch into this one") - .def("is_empty", &frequent_items_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("get_num_active_items", &frequent_items_sketch::get_num_active_items, - "Returns the number of active items in the sketch") - .def("get_total_weight", &frequent_items_sketch::get_total_weight, - "Returns the sum of the weights (frequencies) in the stream seen so far by the sketch") - .def("get_estimate", &frequent_items_sketch::get_estimate, py::arg("item"), - "Returns the estimate of the weight (frequency) of the given item.\n" - "Note: The true frequency of a item would be the sum of the counts as a result of the " - "two update functions.") - .def("get_lower_bound", &frequent_items_sketch::get_lower_bound, py::arg("item"), - "Returns the guaranteed lower bound weight (frequency) of the given item.") - .def("get_upper_bound", &frequent_items_sketch::get_upper_bound, py::arg("item"), - "Returns the guaranteed upper bound weight (frequency) of the given item.") - .def("get_sketch_epsilon", (double (frequent_items_sketch::*)(void) const) &frequent_items_sketch::get_epsilon, - "Returns the epsilon value used by the sketch to compute error") - .def( - "get_frequent_items", - [](const frequent_items_sketch& sk, frequent_items_error_type err_type, uint64_t threshold) { - if (threshold == 0) threshold = sk.get_maximum_error(); - py::list list; - auto rows = sk.get_frequent_items(err_type, threshold); - for (auto row: rows) { - list.append(py::make_tuple( - row.get_item(), - row.get_estimate(), - row.get_lower_bound(), - row.get_upper_bound()) - ); - } - return list; - }, - py::arg("err_type"), py::arg("threshold")=0 - ) - .def_static( - "get_epsilon_for_lg_size", - [](uint8_t lg_max_map_size) { return frequent_items_sketch::get_epsilon(lg_max_map_size); }, - py::arg("lg_max_map_size"), - "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)" - ) - .def_static( - "get_apriori_error", - &frequent_items_sketch::get_apriori_error, - py::arg("lg_max_map_size"), py::arg("estimated_total_weight"), - "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight." - ); - - // serialization may need a caller-provided serde depending on the sketch type, so - // we use a separate method to handle that appropriately based on type T. - add_serialization(fi_class); -} - -// std::string or arithmetic types, for which we have a built-in serde -template::value || std::is_same::value, bool>::type> -void add_serialization(py::class_>& clazz) { - using namespace datasketches; - clazz.def( - "get_serialized_size_bytes", - [](const frequent_items_sketch& sk) { return sk.get_serialized_size_bytes(); }, - "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at." - ) - .def( - "serialize", - [](const frequent_items_sketch& sk) { - auto bytes = sk.serialize(); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, - "Serializes the sketch into a bytes object." - ) - .def_static( - "deserialize", - [](const std::string& bytes) { return frequent_items_sketch::deserialize(bytes.data(), bytes.size()); }, - py::arg("bytes"), - "Reads a bytes object and returns the corresponding frequent_strings_sketch." - ); -} - -// py::object or any other type that requires a provided serde -template::value && !std::is_same::value, bool>::type> -void add_serialization(py::class_>& clazz) { - using namespace datasketches; - clazz.def( - "get_serialized_size_bytes", - [](const frequent_items_sketch& sk, py_object_serde& serde) { return sk.get_serialized_size_bytes(serde); }, - py::arg("serde"), - "Computes the size needed to serialize the current state of the sketch using the provided serde. This can be expensive since every item needs to be looked at." - ) - .def( - "serialize", - [](const frequent_items_sketch& sk, py_object_serde& serde) { - auto bytes = sk.serialize(0, serde); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, py::arg("serde"), - "Serializes the sketch into a bytes object using the provided serde." - ) - .def_static( - "deserialize", - [](const std::string& bytes, py_object_serde& serde) { - return frequent_items_sketch::deserialize(bytes.data(), bytes.size(), serde); - }, py::arg("bytes"), py::arg("serde"), - "Reads a bytes object using the provided serde and returns the corresponding frequent_strings_sketch." - ); -} - -// calls class __hash__ method -struct py_hash_caller { - size_t operator()(const py::object& a) const { - return py::hash(a); - } -}; - -// calls class __eq__ method -struct py_equal_caller { - bool operator()(const py::object& a, const py::object& b) const { - return a.equal(b); - } -}; - -void init_fi(py::module &m) { - using namespace datasketches; - - py::enum_(m, "frequent_items_error_type") - .value("NO_FALSE_POSITIVES", NO_FALSE_POSITIVES) - .value("NO_FALSE_NEGATIVES", NO_FALSE_NEGATIVES) - .export_values(); - - bind_fi_sketch, std::equal_to>(m, "frequent_strings_sketch"); - bind_fi_sketch(m, "frequent_items_sketch"); -} diff --git a/python/src/hll_wrapper.cpp b/python/src/hll_wrapper.cpp deleted file mode 100644 index 52690b20..00000000 --- a/python/src/hll_wrapper.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include - -#include "hll.hpp" - -namespace py = pybind11; - -void init_hll(py::module &m) { - using namespace datasketches; - - py::enum_(m, "tgt_hll_type", "Target HLL flavor") - .value("HLL_4", HLL_4) - .value("HLL_6", HLL_6) - .value("HLL_8", HLL_8) - .export_values(); - - py::class_(m, "hll_sketch") - .def(py::init(), py::arg("lg_k")) - .def(py::init(), py::arg("lg_k"), py::arg("tgt_type")) - .def(py::init(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false) - .def("__str__", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string, - py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false, - "Produces a string summary of the sketch") - .def("to_string", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string, - py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false, - "Produces a string summary of the sketch") - .def_property_readonly("lg_config_k", &hll_sketch::get_lg_config_k, "Configured lg_k value for the sketch") - .def_property_readonly("tgt_type", &hll_sketch::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode") - .def("get_estimate", &hll_sketch::get_estimate, - "Estimate of the distinct count of the input stream") - .def("get_lower_bound", &hll_sketch::get_lower_bound, py::arg("num_std_devs"), - "Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}") - .def("get_upper_bound", &hll_sketch::get_upper_bound, py::arg("num_std_devs"), - "Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}") - .def("is_compact", &hll_sketch::is_compact, - "True if the sketch is compact, otherwise False") - .def("is_empty", &hll_sketch::is_empty, - "True if the sketch is empty, otherwise False") - .def("get_updatable_serialization_bytes", &hll_sketch::get_updatable_serialization_bytes, - "Returns the size of the serialized sketch") - .def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes, - "Returns the size of the serialized sketch when compressing the exception table if HLL_4") - .def("reset", &hll_sketch::reset, - "Resets the sketch to the empty state in coupon collection mode") - .def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"), - "Updates the sketch with the given integral value") - .def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"), - "Updates the sketch with the given floating point value") - .def("update", (void (hll_sketch::*)(const std::string&)) &hll_sketch::update, py::arg("datum"), - "Updates the sketch with the given string value") - .def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes, - py::arg("lg_k"), py::arg("tgt_type"), - "Provides a likely upper bound on serialization size for the given parameters") - .def_static("get_rel_err", &hll_sketch::get_rel_err, - py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"), - "Returns the a priori relative error bound for the given parameters") - .def( - "serialize_compact", - [](const hll_sketch& sk) { - auto bytes = sk.serialize_compact(); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, - "Serializes the sketch into a bytes object, compressing the exception table if HLL_4" - ) - .def( - "serialize_updatable", - [](const hll_sketch& sk) { - auto bytes = sk.serialize_updatable(); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, - "Serializes the sketch into a bytes object" - ) - .def_static( - "deserialize", - [](const std::string& bytes) { return hll_sketch::deserialize(bytes.data(), bytes.size()); }, - py::arg("bytes"), - "Reads a bytes object and returns the corresponding hll_sketch" - ); - - py::class_(m, "hll_union") - .def(py::init(), py::arg("lg_max_k")) - .def_property_readonly("lg_config_k", &hll_union::get_lg_config_k, "Configured lg_k value for the union") - .def_property_readonly("tgt_type", &hll_union::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode") - .def("get_estimate", &hll_union::get_estimate, - "Estimate of the distinct count of the input stream") - .def("get_lower_bound", &hll_union::get_lower_bound, py::arg("num_std_devs"), - "Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}") - .def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"), - "Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}") - .def("is_empty", &hll_union::is_empty, - "True if the union is empty, otherwise False") - .def("reset", &hll_union::reset, - "Resets the union to the empty state") - .def("get_result", &hll_union::get_result, py::arg("tgt_type")=HLL_4, - "Returns a sketch of the target type representing the current union state") - .def("update", &hll_union::update, py::arg("sketch"), - "Updates the union with the given HLL sketch") - .def("update", &hll_union::update, py::arg("datum"), - "Updates the union with the given integral value") - .def("update", &hll_union::update, py::arg("datum"), - "Updates the union with the given floating point value") - .def("update", &hll_union::update, py::arg("datum"), - "Updates the union with the given string value") - .def_static("get_rel_err", &hll_union::get_rel_err, - py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"), - "Returns the a priori relative error bound for the given parameters") - ; -} diff --git a/python/src/kll_wrapper.cpp b/python/src/kll_wrapper.cpp deleted file mode 100644 index e80e921d..00000000 --- a/python/src/kll_wrapper.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "py_object_lt.hpp" -#include "py_object_ostream.hpp" -#include "quantile_conditional.hpp" -#include "kll_sketch.hpp" - -#include -#include -#include -#include - -namespace py = pybind11; - -template -void bind_kll_sketch(py::module &m, const char* name) { - using namespace datasketches; - - auto kll_class = py::class_>(m, name) - .def(py::init(), py::arg("k")=kll_constants::DEFAULT_K) - .def(py::init&>()) - .def( - "update", - static_cast::*)(const T&)>(&kll_sketch::update), - py::arg("item"), - "Updates the sketch with the given value" - ) - .def("merge", (void (kll_sketch::*)(const kll_sketch&)) &kll_sketch::merge, py::arg("sketch"), - "Merges the provided sketch into this one") - .def("__str__", &kll_sketch::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("to_string", &kll_sketch::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("is_empty", &kll_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("get_k", &kll_sketch::get_k, - "Returns the configured parameter k") - .def("get_n", &kll_sketch::get_n, - "Returns the length of the input stream") - .def("get_num_retained", &kll_sketch::get_num_retained, - "Returns the number of retained items (samples) in the sketch") - .def("is_estimation_mode", &kll_sketch::is_estimation_mode, - "Returns True if the sketch is in estimation mode, otherwise False") - .def("get_min_value", &kll_sketch::get_min_item, - "Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError") - .def("get_max_value", &kll_sketch::get_max_item, - "Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError") - .def("get_quantile", &kll_sketch::get_quantile, py::arg("rank"), py::arg("inclusive")=false, - "Returns an approximation to the data value " - "associated with the given normalized rank in a hypothetical sorted " - "version of the input stream so far.\n" - "For kll_floats_sketch: if the sketch is empty this returns nan. " - "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.") - .def( - "get_quantiles", - [](const kll_sketch& sk, const std::vector& ranks, bool inclusive) { - return sk.get_quantiles(ranks.data(), ranks.size(), inclusive); - }, - py::arg("ranks"), py::arg("inclusive")=false, - "This returns an array that could have been generated by using get_quantile() for each " - "normalized rank separately.\n" - "If the sketch is empty this returns an empty vector.\n" - "Deprecated. Will be removed in the next major version. Use get_quantile() instead." - ) - .def("get_rank", &kll_sketch::get_rank, py::arg("value"), py::arg("inclusive")=false, - "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n" - "The resulting approximation has a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(False) function.\n" - "With the parameter inclusive=true the weight of the given value is included into the rank." - "Otherwise the rank equals the sum of the weights of values less than the given value.\n" - "If the sketch is empty this returns nan.") - .def( - "get_pmf", - [](const kll_sketch& sk, const std::vector& split_points, bool inclusive) { - return sk.get_PMF(split_points.data(), split_points.size(), inclusive); - }, - py::arg("split_points"), py::arg("inclusive")=false, - "Returns an approximation to the Probability Mass Function (PMF) of the input stream " - "given a set of split points (values).\n" - "The resulting approximations have a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(True) function.\n" - "If the sketch is empty this returns an empty vector.\n" - "split_points is an array of m unique, monotonically increasing float values " - "that divide the real number line into m+1 consecutive disjoint intervals.\n" - "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and " - "exclusive of the right split point, with the exception that the last interval will include " - "the maximum value.\n" - "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and " - "inclusive of the right split point.\n" - "It is not necessary to include either the min or max values in these split points." - ) - .def( - "get_cdf", - [](const kll_sketch& sk, const std::vector& split_points, bool inclusive) { - return sk.get_CDF(split_points.data(), split_points.size(), inclusive); - }, - py::arg("split_points"), py::arg("inclusive")=false, - "Returns an approximation to the Cumulative Distribution Function (CDF), which is the " - "cumulative analog of the PMF, of the input stream given a set of split points (values).\n" - "The resulting approximations have a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(True) function.\n" - "If the sketch is empty this returns an empty vector.\n" - "split_points is an array of m unique, monotonically increasing float values " - "that divide the real number line into m+1 consecutive disjoint intervals.\n" - "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and " - "exclusive of the right split point, with the exception that the last interval will include " - "the maximum value.\n" - "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and " - "inclusive of the right split point.\n" - "It is not necessary to include either the min or max values in these split points." - ) - .def( - "normalized_rank_error", - static_cast::*)(bool) const>(&kll_sketch::get_normalized_rank_error), - py::arg("as_pmf"), - "Gets the normalized rank error for this sketch.\n" - "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n" - "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n" - "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials" - ) - .def_static( - "get_normalized_rank_error", - [](uint16_t k, bool pmf) { return kll_sketch::get_normalized_rank_error(k, pmf); }, - py::arg("k"), py::arg("as_pmf"), - "Gets the normalized rank error given parameters k and the pmf flag.\n" - "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n" - "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n" - "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials" - ) - .def("__iter__", [](const kll_sketch& s) { return py::make_iterator(s.begin(), s.end()); }); - - add_serialization(kll_class); - add_vector_update(kll_class); -} - -void init_kll(py::module &m) { - bind_kll_sketch>(m, "kll_ints_sketch"); - bind_kll_sketch>(m, "kll_floats_sketch"); - bind_kll_sketch>(m, "kll_doubles_sketch"); - bind_kll_sketch(m, "kll_items_sketch"); -} diff --git a/python/src/ks_wrapper.cpp b/python/src/ks_wrapper.cpp deleted file mode 100644 index eea0c743..00000000 --- a/python/src/ks_wrapper.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "kolmogorov_smirnov.hpp" -#include "kll_sketch.hpp" -#include "quantiles_sketch.hpp" - -#include - -namespace py = pybind11; - -void init_kolmogorov_smirnov(py::module &m) { - using namespace datasketches; - - m.def("ks_test", &kolmogorov_smirnov::test>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"), - "Performs the Kolmogorov-Smirnov Test between kll_ints_sketches.\n" - "Note: if the given sketches have insufficient data or if the sketch sizes are too small, " - "this will return false.\n" - "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying " - "distribution) using the provided p-value, otherwise False."); - m.def("ks_test", &kolmogorov_smirnov::test>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"), - "Performs the Kolmogorov-Smirnov Test between kll_floats_sketches.\n" - "Note: if the given sketches have insufficient data or if the sketch sizes are too small, " - "this will return false.\n" - "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying " - "distribution) using the provided p-value, otherwise False."); - m.def("ks_test", &kolmogorov_smirnov::test>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"), - "Performs the Kolmogorov-Smirnov Test between kll_doubles_sketches.\n" - "Note: if the given sketches have insufficient data or if the sketch sizes are too small, " - "this will return false.\n" - "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying " - "distribution) using the provided p-value, otherwise False."); - - m.def("ks_test", &kolmogorov_smirnov::test>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"), - "Performs the Kolmogorov-Smirnov Test between quantiles_ints_sketches.\n" - "Note: if the given sketches have insufficient data or if the sketch sizes are too small, " - "this will return false.\n" - "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying " - "distribution) using the provided p-value, otherwise False."); - m.def("ks_test", &kolmogorov_smirnov::test>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"), - "Performs the Kolmogorov-Smirnov Test between quantiles_floats_sketches.\n" - "Note: if the given sketches have insufficient data or if the sketch sizes are too small, " - "this will return false.\n" - "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying " - "distribution) using the provided p-value, otherwise False."); - m.def("ks_test", &kolmogorov_smirnov::test>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"), - "Performs the Kolmogorov-Smirnov Test between quantiles_doubles_sketches.\n" - "Note: if the given sketches have insufficient data or if the sketch sizes are too small, " - "this will return false.\n" - "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying " - "distribution) using the provided p-value, otherwise False."); -} diff --git a/python/src/py_serde.cpp b/python/src/py_serde.cpp deleted file mode 100644 index 7219ed08..00000000 --- a/python/src/py_serde.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include "memory_operations.hpp" - -#include "py_serde.hpp" - -#include - -namespace py = pybind11; - -void init_serde(py::module& m) { - using namespace datasketches; - py::class_(m, "PyObjectSerDe") - .def(py::init<>()) - .def("get_size", &py_object_serde::get_size, py::arg("item"), - "Returns the size in bytes of an item") - .def("to_bytes", &py_object_serde::to_bytes, py::arg("item"), - "Retuns a bytes object with a serialized version of an item") - .def("from_bytes", &py_object_serde::from_bytes, py::arg("data"), py::arg("offset"), - "Reads a bytes object starting from the given offest and returns a tuple of the reconstructed " - "object and the number of additional bytes read") - ; -} - -namespace datasketches { - size_t py_object_serde::size_of_item(const py::object& item) const { - return get_size(item); - } - - size_t py_object_serde::serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const { - size_t bytes_written = 0; - py::gil_scoped_acquire acquire; - for (unsigned i = 0; i < num; ++i) { - std::string bytes = to_bytes(items[i]); // implicit cast from py::bytes - check_memory_size(bytes_written + bytes.size(), capacity); - memcpy(ptr, bytes.c_str(), bytes.size()); - ptr = static_cast(ptr) + bytes.size(); - bytes_written += bytes.size(); - } - py::gil_scoped_release release; - return bytes_written; - } - - size_t py_object_serde::deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const { - size_t bytes_read = 0; - unsigned i = 0; - bool failure = false; - bool error_from_python = false; - py::gil_scoped_acquire acquire; - - // copy data into bytes only once - py::bytes bytes(static_cast(ptr), capacity); - for (; i < num && !failure; ++i) { - py::tuple bytes_and_len; - try { - bytes_and_len = from_bytes(bytes, bytes_read); - } catch (py::error_already_set &e) { - failure = true; - error_from_python = true; - break; - } - - size_t length = py::cast(bytes_and_len[1]); - if (bytes_read + length > capacity) { - bytes_read += length; // use this value to report the error - failure = true; - break; - } - - new (&items[i]) py::object(py::cast(bytes_and_len[0])); - ptr = static_cast(ptr) + length; - bytes_read += length; - } - - if (failure) { - // clean up what we've allocated - for (unsigned j = 0; j < i; ++j) { - items[j].dec_ref(); - } - - if (error_from_python) { - throw py::value_error("Error reading value in from_bytes"); - } else { - // this next call will throw - check_memory_size(bytes_read, capacity); - } - } - - py::gil_scoped_release release; - return bytes_read; - } - - -} // namespace datasketches \ No newline at end of file diff --git a/python/src/quantiles_wrapper.cpp b/python/src/quantiles_wrapper.cpp deleted file mode 100644 index c4a9f522..00000000 --- a/python/src/quantiles_wrapper.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "py_object_lt.hpp" -#include "py_object_ostream.hpp" -#include "quantile_conditional.hpp" -#include "quantiles_sketch.hpp" - -#include -#include -#include -#include -#include - -namespace py = pybind11; - -template -void bind_quantiles_sketch(py::module &m, const char* name) { - using namespace datasketches; - - auto quantiles_class = py::class_>(m, name) - .def(py::init(), py::arg("k")=quantiles_constants::DEFAULT_K) - .def(py::init&>()) - .def( - "update", - static_cast::*)(const T&)>(&quantiles_sketch::update), - py::arg("item"), - "Updates the sketch with the given value" - ) - .def("merge", (void (quantiles_sketch::*)(const quantiles_sketch&)) &quantiles_sketch::merge, py::arg("sketch"), - "Merges the provided sketch into this one") - .def("__str__", &quantiles_sketch::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("to_string", &quantiles_sketch::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("is_empty", &quantiles_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("get_k", &quantiles_sketch::get_k, - "Returns the configured parameter k") - .def("get_n", &quantiles_sketch::get_n, - "Returns the length of the input stream") - .def("get_num_retained", &quantiles_sketch::get_num_retained, - "Returns the number of retained items (samples) in the sketch") - .def("is_estimation_mode", &quantiles_sketch::is_estimation_mode, - "Returns True if the sketch is in estimation mode, otherwise False") - .def("get_min_value", &quantiles_sketch::get_min_item, - "Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError") - .def("get_max_value", &quantiles_sketch::get_max_item, - "Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError") - .def("get_quantile", &quantiles_sketch::get_quantile, py::arg("rank"), py::arg("inclusive")=false, - "Returns an approximation to the data value " - "associated with the given rank in a hypothetical sorted " - "version of the input stream so far.\n" - "For quantiles_floats_sketch: if the sketch is empty this returns nan. " - "For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.") - .def( - "get_quantiles", - [](const quantiles_sketch& sk, const std::vector& ranks, bool inclusive) { - return sk.get_quantiles(ranks.data(), ranks.size(), inclusive); - }, - py::arg("ranks"), py::arg("inclusive")=false, - "This returns an array that could have been generated by using get_quantile() for each " - "normalized rank separately.\n" - "If the sketch is empty this returns an empty vector.\n" - "Deprecated. Will be removed in the next major version. Use get_quantile() instead." - ) - .def("get_rank", &quantiles_sketch::get_rank, py::arg("value"), py::arg("inclusive")=false, - "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n" - "The resulting approximation has a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(False) function.\n" - "With the parameter inclusive=true the weight of the given value is included into the rank." - "Otherwise the rank equals the sum of the weights of values less than the given value.\n" - "If the sketch is empty this returns nan.") - .def( - "get_pmf", - [](const quantiles_sketch& sk, const std::vector& split_points, bool inclusive) { - return sk.get_PMF(split_points.data(), split_points.size(), inclusive); - }, - py::arg("split_points"), py::arg("inclusive")=false, - "Returns an approximation to the Probability Mass Function (PMF) of the input stream " - "given a set of split points (values).\n" - "The resulting approximations have a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(True) function.\n" - "If the sketch is empty this returns an empty vector.\n" - "split_points is an array of m unique, monotonically increasing float values " - "that divide the real number line into m+1 consecutive disjoint intervals.\n" - "The definition of an 'interval' is inclusive of the left split point (or minimum value) and " - "exclusive of the right split point, with the exception that the last interval will include " - "the maximum value.\n" - "It is not necessary to include either the min or max values in these split points." - ) - .def( - "get_cdf", - [](const quantiles_sketch& sk, const std::vector& split_points, bool inclusive) { - return sk.get_CDF(split_points.data(), split_points.size(), inclusive); - }, - py::arg("split_points"), py::arg("inclusive")=false, - "Returns an approximation to the Cumulative Distribution Function (CDF), which is the " - "cumulative analog of the PMF, of the input stream given a set of split points (values).\n" - "The resulting approximations have a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(True) function.\n" - "If the sketch is empty this returns an empty vector.\n" - "split_points is an array of m unique, monotonically increasing float values " - "that divide the real number line into m+1 consecutive disjoint intervals.\n" - "The definition of an 'interval' is inclusive of the left split point (or minimum value) and " - "exclusive of the right split point, with the exception that the last interval will include " - "the maximum value.\n" - "It is not necessary to include either the min or max values in these split points." - ) - .def( - "normalized_rank_error", - static_cast::*)(bool) const>(&quantiles_sketch::get_normalized_rank_error), - py::arg("as_pmf"), - "Gets the normalized rank error for this sketch.\n" - "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n" - "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n" - "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials" - ) - .def_static( - "get_normalized_rank_error", - [](uint16_t k, bool pmf) { return quantiles_sketch::get_normalized_rank_error(k, pmf); }, - py::arg("k"), py::arg("as_pmf"), - "Gets the normalized rank error given parameters k and the pmf flag.\n" - "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n" - "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n" - "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials" - ) - .def("__iter__", [](const quantiles_sketch& s) { return py::make_iterator(s.begin(), s.end()); }); - - add_serialization(quantiles_class); - add_vector_update(quantiles_class); -} - -void init_quantiles(py::module &m) { - bind_quantiles_sketch>(m, "quantiles_ints_sketch"); - bind_quantiles_sketch>(m, "quantiles_floats_sketch"); - bind_quantiles_sketch>(m, "quantiles_doubles_sketch"); - bind_quantiles_sketch(m, "quantiles_items_sketch"); -} diff --git a/python/src/req_wrapper.cpp b/python/src/req_wrapper.cpp deleted file mode 100644 index 30368abf..00000000 --- a/python/src/req_wrapper.cpp +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "py_object_lt.hpp" -#include "py_object_ostream.hpp" -#include "quantile_conditional.hpp" -#include "req_sketch.hpp" - -#include -#include -#include -#include -#include - -namespace py = pybind11; - -template -void bind_req_sketch(py::module &m, const char* name) { - using namespace datasketches; - - auto req_class = py::class_>(m, name) - .def(py::init(), py::arg("k")=12, py::arg("is_hra")=true) - .def(py::init&>()) - .def("update", (void (req_sketch::*)(const T&)) &req_sketch::update, py::arg("item"), - "Updates the sketch with the given value") - .def("merge", (void (req_sketch::*)(const req_sketch&)) &req_sketch::merge, py::arg("sketch"), - "Merges the provided sketch into this one") - .def("__str__", &req_sketch::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("to_string", &req_sketch::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("is_hra", &req_sketch::is_HRA, - "Returns True if the sketch is in High Rank Accuracy mode, otherwise False") - .def("is_empty", &req_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("get_k", &req_sketch::get_k, - "Returns the configured parameter k") - .def("get_n", &req_sketch::get_n, - "Returns the length of the input stream") - .def("get_num_retained", &req_sketch::get_num_retained, - "Returns the number of retained items (samples) in the sketch") - .def("is_estimation_mode", &req_sketch::is_estimation_mode, - "Returns True if the sketch is in estimation mode, otherwise False") - .def("get_min_value", &req_sketch::get_min_item, - "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError") - .def("get_max_value", &req_sketch::get_max_item, - "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError") - .def("get_quantile", &req_sketch::get_quantile, py::arg("rank"), py::arg("inclusive")=false, - "Returns an approximation to the data value " - "associated with the given normalized rank in a hypothetical sorted " - "version of the input stream so far.\n" - "For req_floats_sketch: if the sketch is empty this returns nan. " - "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.") - .def( - "get_quantiles", - [](const req_sketch& sk, const std::vector& ranks, bool inclusive) { - return sk.get_quantiles(ranks.data(), ranks.size(), inclusive); - }, - py::arg("ranks"), py::arg("inclusive")=false, - "This returns an array that could have been generated by using get_quantile() for each " - "normalized rank separately.\n" - "If the sketch is empty this returns an empty vector.\n" - "Deprecated. Will be removed in the next major version. Use get_quantile() instead." - ) - .def("get_rank", &req_sketch::get_rank, py::arg("value"), py::arg("inclusive")=false, - "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n" - "The resulting approximation has a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(False) function.\n" - "With the parameter inclusive=true the weight of the given value is included into the rank." - "Otherwise the rank equals the sum of the weights of values less than the given value.\n" - "If the sketch is empty this returns nan.") - .def( - "get_pmf", - [](const req_sketch& sk, const std::vector& split_points, bool inclusive) { - return sk.get_PMF(split_points.data(), split_points.size(), inclusive); - }, - py::arg("split_points"), py::arg("inclusive")=false, - "Returns an approximation to the Probability Mass Function (PMF) of the input stream " - "given a set of split points (values).\n" - "The resulting approximations have a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(True) function.\n" - "If the sketch is empty this returns an empty vector.\n" - "split_points is an array of m unique, monotonically increasing float values " - "that divide the real number line into m+1 consecutive disjoint intervals.\n" - "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and " - "exclusive of the right split point, with the exception that the last interval will include " - "the maximum value.\n" - "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and " - "inclusive of the right split point.\n" - "It is not necessary to include either the min or max values in these split points." - ) - .def( - "get_cdf", - [](const req_sketch& sk, const std::vector& split_points, bool inclusive) { - return sk.get_CDF(split_points.data(), split_points.size(), inclusive); - }, - py::arg("split_points"), py::arg("inclusive")=false, - "Returns an approximation to the Cumulative Distribution Function (CDF), which is the " - "cumulative analog of the PMF, of the input stream given a set of split points (values).\n" - "The resulting approximations have a probabilistic guarantee that can be obtained from the " - "get_normalized_rank_error(True) function.\n" - "If the sketch is empty this returns an empty vector.\n" - "split_points is an array of m unique, monotonically increasing float values " - "that divide the real number line into m+1 consecutive disjoint intervals.\n" - "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and " - "exclusive of the right split point, with the exception that the last interval will include " - "the maximum value.\n" - "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and " - "inclusive of the right split point.\n" - "It is not necessary to include either the min or max values in these split points." - ) - .def("get_rank_lower_bound", &req_sketch::get_rank_lower_bound, py::arg("rank"), py::arg("num_std_dev"), - "Returns an approximate lower bound on the given normalized rank.\n" - "Normalized rank must be a value between 0.0 and 1.0 (inclusive); " - "the number of standard deviations must be 1, 2, or 3.") - .def("get_rank_upper_bound", &req_sketch::get_rank_upper_bound, py::arg("rank"), py::arg("num_std_dev"), - "Returns an approximate upper bound on the given normalized rank.\n" - "Normalized rank must be a value between 0.0 and 1.0 (inclusive); " - "the number of standard deviations must be 1, 2, or 3.") - .def_static("get_RSE", &req_sketch::get_RSE, - py::arg("k"), py::arg("rank"), py::arg("is_hra"), py::arg("n"), - "Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). " - "Derived from Lemma 12 in http://arxiv.org/abs/2004.01668v2, but the constant factors have been " - "modified based on empirical measurements, for a given value of parameter k.\n" - "Normalized rank must be a value between 0.0 and 1.0 (inclusive). If is_hra is True, uses high " - "rank accuracy mode, else low rank accuracy. N is an estimate of the total number of points " - "provided to the sketch.") - .def("__iter__", [](const req_sketch& s) { return py::make_iterator(s.begin(), s.end()); }); - - add_serialization(req_class); - add_vector_update(req_class); -} - -void init_req(py::module &m) { - bind_req_sketch>(m, "req_ints_sketch"); - bind_req_sketch>(m, "req_floats_sketch"); - bind_req_sketch(m, "req_items_sketch"); -} diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp deleted file mode 100644 index 033e6ca8..00000000 --- a/python/src/theta_wrapper.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include - -#include "theta_sketch.hpp" -#include "theta_union.hpp" -#include "theta_intersection.hpp" -#include "theta_a_not_b.hpp" -#include "theta_jaccard_similarity.hpp" -#include "common_defs.hpp" - -namespace py = pybind11; - -void init_theta(py::module &m) { - using namespace datasketches; - - py::class_(m, "theta_sketch") - .def("__str__", &theta_sketch::to_string, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("to_string", &theta_sketch::to_string, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("is_empty", &theta_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("get_estimate", &theta_sketch::get_estimate, - "Estimate of the distinct count of the input stream") - .def("get_upper_bound", &theta_sketch::get_upper_bound, py::arg("num_std_devs"), - "Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}") - .def("get_lower_bound", &theta_sketch::get_lower_bound, py::arg("num_std_devs"), - "Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}") - .def("is_estimation_mode", &theta_sketch::is_estimation_mode, - "Returns True if sketch is in estimation mode, otherwise False") - .def("get_theta", &theta_sketch::get_theta, - "Returns theta (effective sampling rate) as a fraction from 0 to 1") - .def("get_theta64", &theta_sketch::get_theta64, - "Returns theta as 64-bit value") - .def("get_num_retained", &theta_sketch::get_num_retained, - "Returns the number of items currently in the sketch") - .def("get_seed_hash", &theta_sketch::get_seed_hash, - "Returns a hash of the seed used in the sketch") - .def("is_ordered", &theta_sketch::is_ordered, - "Returns True if the sketch entries are sorted, otherwise False") - .def("__iter__", [](const theta_sketch& s) { return py::make_iterator(s.begin(), s.end()); }) - ; - - py::class_(m, "update_theta_sketch") - .def( - py::init([](uint8_t lg_k, double p, uint64_t seed) { - return update_theta_sketch::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build(); - }), - py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED - ) - .def(py::init()) - .def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"), - "Updates the sketch with the given integral value") - .def("update", (void (update_theta_sketch::*)(double)) &update_theta_sketch::update, py::arg("datum"), - "Updates the sketch with the given floating point value") - .def("update", (void (update_theta_sketch::*)(const std::string&)) &update_theta_sketch::update, py::arg("datum"), - "Updates the sketch with the given string") - .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true, - "Returns a compacted form of the sketch, optionally sorting it") - .def("trim", &update_theta_sketch::trim, "Removes retained entries in excess of the nominal size k (if any)") - .def("reset", &update_theta_sketch::reset, "Resets the sketch to the initial empty state") - ; - - py::class_(m, "compact_theta_sketch") - .def(py::init()) - .def(py::init()) - .def( - "serialize", - [](const compact_theta_sketch& sk) { - auto bytes = sk.serialize(); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, - "Serializes the sketch into a bytes object" - ) - .def_static( - "deserialize", - [](const std::string& bytes, uint64_t seed) { - return compact_theta_sketch::deserialize(bytes.data(), bytes.size(), seed); - }, - py::arg("bytes"), py::arg("seed")=DEFAULT_SEED, - "Reads a bytes object and returns the corresponding compact_theta_sketch" - ); - - py::class_(m, "theta_union") - .def( - py::init([](uint8_t lg_k, double p, uint64_t seed) { - return theta_union::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build(); - }), - py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED - ) - .def("update", &theta_union::update, py::arg("sketch"), - "Updates the union with the given sketch") - .def("get_result", &theta_union::get_result, py::arg("ordered")=true, - "Returns the sketch corresponding to the union result") - ; - - py::class_(m, "theta_intersection") - .def(py::init(), py::arg("seed")=DEFAULT_SEED) - .def(py::init()) - .def("update", &theta_intersection::update, py::arg("sketch"), - "Intersections the provided sketch with the current intersection state") - .def("get_result", &theta_intersection::get_result, py::arg("ordered")=true, - "Returns the sketch corresponding to the intersection result") - .def("has_result", &theta_intersection::has_result, - "Returns True if the intersection has a valid result, otherwise False") - ; - - py::class_(m, "theta_a_not_b") - .def(py::init(), py::arg("seed")=DEFAULT_SEED) - .def( - "compute", - &theta_a_not_b::compute, - py::arg("a"), py::arg("b"), py::arg("ordered")=true, - "Returns a sketch with the result of applying the A-not-B operation on the given inputs" - ) - ; - - py::class_(m, "theta_jaccard_similarity") - .def_static( - "jaccard", - [](const theta_sketch& sketch_a, const theta_sketch& sketch_b, uint64_t seed) { - return theta_jaccard_similarity::jaccard(sketch_a, sketch_b, seed); - }, - py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED, - "Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches" - ) - .def_static( - "exactly_equal", - &theta_jaccard_similarity::exactly_equal, - py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED, - "Returns True if sketch_a and sketch_b are equivalent, otherwise False" - ) - .def_static( - "similarity_test", - &theta_jaccard_similarity::similarity_test, - py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED, - "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard " - "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered " - "to be similar with a confidence of 97.7% and returns True, otherwise False.") - .def_static( - "dissimilarity_test", - &theta_jaccard_similarity::dissimilarity_test, - py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED, - "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard " - "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered " - "to be dissimilar with a confidence of 97.7% and returns True, otherwise False." - ) - ; -} diff --git a/python/src/tuple_wrapper.cpp b/python/src/tuple_wrapper.cpp deleted file mode 100644 index 343181d2..00000000 --- a/python/src/tuple_wrapper.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include - -#include "theta_sketch.hpp" -#include "tuple_sketch.hpp" -#include "tuple_union.hpp" -#include "tuple_intersection.hpp" -#include "tuple_a_not_b.hpp" -#include "theta_jaccard_similarity_base.hpp" -#include "common_defs.hpp" - -#include "py_serde.hpp" -#include "tuple_policy.hpp" - -namespace py = pybind11; - -void init_tuple(py::module &m) { - using namespace datasketches; - - // generic tuple_policy: - // * update sketch policy uses create_summary and update_summary - // * set operation policies all use __call__ - py::class_>(m, "TuplePolicy") - .def(py::init()) - .def("create_summary", &tuple_policy::create_summary) - .def("update_summary", &tuple_policy::update_summary, py::arg("summary"), py::arg("update")) - .def("__call__", &tuple_policy::operator(), py::arg("summary"), py::arg("update")) - ; - - // potentially useful for debugging but not needed as a permanent - // object type in the library - /* - py::class_(m, "TuplePolicyHolder") - .def(py::init>(), py::arg("policy")) - .def("create", &tuple_policy_holder::create, "Creates a new Summary object") - .def("update", &tuple_policy_holder::update, py::arg("summary"), py::arg("update"), - "Updates the provided summary using the data in update") - ; - */ - - using py_tuple_sketch = tuple_sketch; - using py_update_tuple = update_tuple_sketch; - using py_compact_tuple = compact_tuple_sketch; - using py_tuple_union = tuple_union; - using py_tuple_intersection = tuple_intersection; - using py_tuple_a_not_b = tuple_a_not_b; - using py_tuple_jaccard_similarity = jaccard_similarity_base, tuple_intersection, pair_extract_key>; - - py::class_(m, "_tuple_sketch") - .def("__str__", &py_tuple_sketch::to_string, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("to_string", &py_tuple_sketch::to_string, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("is_empty", &py_tuple_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("get_estimate", &py_tuple_sketch::get_estimate, - "Estimate of the distinct count of the input stream") - .def("get_upper_bound", static_cast(&py_tuple_sketch::get_upper_bound), py::arg("num_std_devs"), - "Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}") - .def("get_lower_bound", static_cast(&py_tuple_sketch::get_lower_bound), py::arg("num_std_devs"), - "Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}") - .def("is_estimation_mode", &py_tuple_sketch::is_estimation_mode, - "Returns True if sketch is in estimation mode, otherwise False") - .def("get_theta", &py_tuple_sketch::get_theta, - "Returns theta (effective sampling rate) as a fraction from 0 to 1") - .def("get_theta64", &py_tuple_sketch::get_theta64, - "Returns theta as 64-bit value") - .def("get_num_retained", &py_tuple_sketch::get_num_retained, - "Returns the number of items currently in the sketch") - .def("get_seed_hash", [](const py_tuple_sketch& sk) { return sk.get_seed_hash(); }, // why does regular call not work?? - "Returns a hash of the seed used in the sketch") - .def("is_ordered", &py_tuple_sketch::is_ordered, - "Returns True if the sketch entries are sorted, otherwise False") - .def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); }) - .def_property_readonly_static("DEFAULT_SEED", [](py::object /* self */) { return DEFAULT_SEED; }); - ; - - py::class_(m, "_compact_tuple_sketch") - .def(py::init(), py::arg("other")) - .def(py::init(), py::arg("other"), py::arg("ordered")=true) - .def(py::init(), py::arg("other"), py::arg("summary"), - "Creates a compact tuple sketch from a theta sketch using a fixed summary value.") - .def( - "serialize", - [](const py_compact_tuple& sk, py_object_serde& serde) { - auto bytes = sk.serialize(0, serde); - return py::bytes(reinterpret_cast(bytes.data()), bytes.size()); - }, py::arg("serde"), - "Serializes the sketch into a bytes object" - ) - .def_static( - "deserialize", - [](const std::string& bytes, py_object_serde& serde, uint64_t seed) { - return py_compact_tuple::deserialize(bytes.data(), bytes.size(), seed, serde); - }, - py::arg("bytes"), py::arg("serde"), py::arg("seed")=DEFAULT_SEED, - "Reads a bytes object and returns the corresponding compact_tuple_sketch" - ); - - py::class_(m, "_update_tuple_sketch") - .def( - py::init([](std::shared_ptr policy, uint8_t lg_k, double p, uint64_t seed) { - tuple_policy_holder holder(policy); - return py_update_tuple::builder(holder).set_lg_k(lg_k).set_p(p).set_seed(seed).build(); - }), - py::arg("policy"), py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED - ) - .def(py::init()) - .def("update", static_cast(&py_update_tuple::update), - py::arg("datum"), py::arg("value"), - "Updates the sketch with the given integral item and summary value") - .def("update", static_cast(&py_update_tuple::update), - py::arg("datum"), py::arg("value"), - "Updates the sketch with the given floating point item and summary value") - .def("update", static_cast(&py_update_tuple::update), - py::arg("datum"), py::arg("value"), - "Updates the sketch with the given string item and summary value") - .def("compact", &py_update_tuple::compact, py::arg("ordered")=true, - "Returns a compacted form of the sketch, optionally sorting it") - .def("trim", &py_update_tuple::trim, "Removes retained entries in excess of the nominal size k (if any)") - .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state") - ; - - py::class_(m, "_tuple_union") - .def( - py::init([](std::shared_ptr policy, uint8_t lg_k, double p, uint64_t seed) { - tuple_policy_holder holder(policy); - return py_tuple_union::builder(holder).set_lg_k(lg_k).set_p(p).set_seed(seed).build(); - }), - py::arg("policy"), py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED - ) - .def("update", &py_tuple_union::update, py::arg("sketch"), - "Updates the union with the given sketch") - .def("get_result", &py_tuple_union::get_result, py::arg("ordered")=true, - "Returns the sketch corresponding to the union result") - .def("reset", &py_tuple_union::reset, - "Resets the sketch to the initial empty") - ; - - py::class_(m, "_tuple_intersection") - .def( - py::init([](std::shared_ptr policy, uint64_t seed) { - tuple_policy_holder holder(policy); - return py_tuple_intersection(seed, holder); - }), - py::arg("policy"), py::arg("seed")=DEFAULT_SEED) - .def("update", &py_tuple_intersection::update, py::arg("sketch"), - "Intersects the provided sketch with the current intersection state") - .def("get_result", &py_tuple_intersection::get_result, py::arg("ordered")=true, - "Returns the sketch corresponding to the intersection result") - .def("has_result", &py_tuple_intersection::has_result, - "Returns True if the intersection has a valid result, otherwise False") - ; - - py::class_(m, "_tuple_a_not_b") - .def(py::init(), py::arg("seed")=DEFAULT_SEED) - .def( - "compute", - &py_tuple_a_not_b::compute, - py::arg("a"), py::arg("b"), py::arg("ordered")=true, - "Returns a sketch with the result of applying the A-not-B operation on the given inputs" - ) - ; - - py::class_(m, "_tuple_jaccard_similarity") - .def_static( - "jaccard", - [](const py_tuple_sketch& sketch_a, const py_tuple_sketch& sketch_b, uint64_t seed) { - return py_tuple_jaccard_similarity::jaccard(sketch_a, sketch_b, seed); - }, - py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED, - "Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches" - ) - .def_static( - "exactly_equal", - &py_tuple_jaccard_similarity::exactly_equal, - py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED, - "Returns True if sketch_a and sketch_b are equivalent, otherwise False" - ) - .def_static( - "similarity_test", - &py_tuple_jaccard_similarity::similarity_test, - py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED, - "Tests similarity of an actual sketch against an expected sketch. Computes the lower bound of the Jaccard " - "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered " - "to be similar with a confidence of 97.7% and returns True, otherwise False.") - .def_static( - "dissimilarity_test", - &py_tuple_jaccard_similarity::dissimilarity_test, - py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED, - "Tests dissimilarity of an actual sketch against an expected sketch. Computes the upper bound of the Jaccard " - "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered " - "to be dissimilar with a confidence of 97.7% and returns True, otherwise False." - ) - ; -} diff --git a/python/src/vector_of_kll.cpp b/python/src/vector_of_kll.cpp deleted file mode 100644 index 68e418c3..00000000 --- a/python/src/vector_of_kll.cpp +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "kll_sketch.hpp" - -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; - -namespace datasketches { - -namespace vector_of_kll_constants { - static const uint32_t DEFAULT_K = kll_constants::DEFAULT_K; - static const uint32_t DEFAULT_D = 1; -} - -// Wrapper class for Numpy compatibility -template > -class vector_of_kll_sketches { - public: - explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D); - vector_of_kll_sketches(const vector_of_kll_sketches& other); - vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept; - vector_of_kll_sketches& operator=(const vector_of_kll_sketches& other); - vector_of_kll_sketches& operator=(vector_of_kll_sketches&& other); - - // container parameters - inline uint32_t get_k() const; - inline uint32_t get_d() const; - - // sketch updates/merges - void update(const py::array_t& items); - void merge(const vector_of_kll_sketches& other); - - // returns a single sketch combining all data in the array - kll_sketch collapse(const py::array_t& isk) const; - - // sketch queries returning an array of results - py::array is_empty() const; - py::array get_n() const; - py::array is_estimation_mode() const; - py::array get_min_values() const; - py::array get_max_values() const; - py::array get_num_retained() const; - py::array get_quantiles(const py::array_t& ranks, const py::array_t& isk) const; - py::array get_ranks(const py::array_t& values, const py::array_t& isk) const; - py::array get_pmf(const py::array_t& split_points, const py::array_t& isk) const; - py::array get_cdf(const py::array_t& split_points, const py::array_t& isk) const; - - // human-readable output - std::string to_string(bool print_levels = false, bool print_items = false) const; - - // binary output/input - py::list serialize(const py::array_t& isk); - // note: deserialize() replaces the sketch at the specified - // index. Not a static method. - void deserialize(const py::bytes& sk_bytes, uint32_t idx); - - private: - std::vector get_indices(const py::array_t& isk) const; - - const uint32_t k_; // kll sketch k parameter - const uint32_t d_; // number of dimensions (here: sketches) to hold - std::vector> sketches_; -}; - -template -vector_of_kll_sketches::vector_of_kll_sketches(uint32_t k, uint32_t d): -k_(k), -d_(d) -{ - // check d is valid (k is checked by kll_sketch) - if (d < 1) { - throw std::invalid_argument("D must be >= 1: " + std::to_string(d)); - } - - sketches_.reserve(d); - // spawn the sketches - for (uint32_t i = 0; i < d; i++) { - sketches_.emplace_back(k); - } -} - -template -vector_of_kll_sketches::vector_of_kll_sketches(const vector_of_kll_sketches& other) : - k_(other.k_), - d_(other.d_), - sketches_(other.sketches_) -{} - -template -vector_of_kll_sketches::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept : - k_(other.k_), - d_(other.d_), - sketches_(std::move(other.sketches_)) -{} - -template -vector_of_kll_sketches& vector_of_kll_sketches::operator=(const vector_of_kll_sketches& other) { - vector_of_kll_sketches copy(other); - k_ = copy.k_; - d_ = copy.d_; - std::swap(sketches_, copy.sketches_); - return *this; -} - -template -vector_of_kll_sketches& vector_of_kll_sketches::operator=(vector_of_kll_sketches&& other) { - k_ = other.k_; - d_ = other.d_; - std::swap(sketches_, other.sketches_); - return *this; -} - -template -uint32_t vector_of_kll_sketches::get_k() const { - return k_; -} - -template -uint32_t vector_of_kll_sketches::get_d() const { - return d_; -} - -template -std::vector vector_of_kll_sketches::get_indices(const py::array_t& isk) const { - std::vector indices; - if (isk.size() == 1) { - auto data = isk.unchecked(); - if (data(0) == -1) { - indices.reserve(d_); - for (uint32_t i = 0; i < d_; ++i) { - indices.push_back(i); - } - } else { - indices.push_back(static_cast(data(0))); - } - } else { - auto data = isk.unchecked<1>(); - indices.reserve(isk.size()); - for (uint32_t i = 0; i < isk.size(); ++i) { - const uint32_t idx = static_cast(data(i)); - if (idx < d_) { - indices.push_back(idx); - } else { - throw std::invalid_argument("request for invalid dimenions >= d (" - + std::to_string(d_) +"): "+ std::to_string(idx)); - } - } - } - return indices; -} - -// Checks if each sketch is empty or not -template -py::array vector_of_kll_sketches::is_empty() const { - std::vector vals(d_); - for (uint32_t i = 0; i < d_; ++i) { - vals[i] = sketches_[i].is_empty(); - } - - return py::cast(vals); -} - -// Updates each sketch with values -// Currently: all values must be present -// TODO: allow subsets of sketches to be updated -template -void vector_of_kll_sketches::update(const py::array_t& items) { - - size_t ndim = items.ndim(); - - if (items.shape(ndim-1) != d_) { - throw std::invalid_argument("input data must have rows with " + std::to_string(d_) - + " elements. Found: " + std::to_string(items.shape(ndim-1))); - } - - if (ndim == 1) { - // 1D case: single value to update per sketch - auto data = items.template unchecked<1>(); - for (uint32_t i = 0; i < d_; ++i) { - sketches_[i].update(data(i)); - } - } - else if (ndim == 2) { - // 2D case: multiple values to update per sketch - auto data = items.template unchecked<2>(); - if (items.flags() & py::array::f_style) { - for (uint32_t j = 0; j < d_; ++j) { - for (uint32_t i = 0; i < items.shape(0); ++i) { - sketches_[j].update(data(i,j)); - } - } - } else { // py::array::c_style or py::array::forcecast - for (uint32_t i = 0; i < items.shape(0); ++i) { - for (uint32_t j = 0; j < d_; ++j) { - sketches_[j].update(data(i,j)); - } - } - } - } - else { - throw std::invalid_argument("Update input must be 2 or fewer dimensions : " + std::to_string(ndim)); - } -} - -// Merges two arrays of sketches -// Currently: all values must be present -template -void vector_of_kll_sketches::merge(const vector_of_kll_sketches& other) { - if (d_ != other.get_d()) { - throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_) - + " vs " + std::to_string(other.d_)); - } else { - for (uint32_t i = 0; i < d_; ++i) { - sketches_[i].merge(other.sketches_[i]); - } - } -} - -template -kll_sketch vector_of_kll_sketches::collapse(const py::array_t& isk) const { - std::vector inds = get_indices(isk); - - kll_sketch result(k_); - for (auto& idx : inds) { - result.merge(sketches_[idx]); - } - return result; -} - -// Number of updates for each sketch -template -py::array vector_of_kll_sketches::get_n() const { - std::vector vals(d_); - for (uint32_t i = 0; i < d_; ++i) { - vals[i] = sketches_[i].get_n(); - } - return py::cast(vals); -} - -// Number of retained values for each sketch -template -py::array vector_of_kll_sketches::get_num_retained() const { - std::vector vals(d_); - for (uint32_t i = 0; i < d_; ++i) { - vals[i] = sketches_[i].get_num_retained(); - } - return py::cast(vals); -} - -// Gets the minimum value of each sketch -// TODO: allow subsets of sketches -template -py::array vector_of_kll_sketches::get_min_values() const { - std::vector vals(d_); - for (uint32_t i = 0; i < d_; ++i) { - vals[i] = sketches_[i].get_min_item(); - } - return py::cast(vals); -} - -// Gets the maximum value of each sketch -// TODO: allow subsets of sketches -template -py::array vector_of_kll_sketches::get_max_values() const { - std::vector vals(d_); - for (uint32_t i = 0; i < d_; ++i) { - vals[i] = sketches_[i].get_max_item(); - } - return py::cast(vals); -} - -// Summary of each sketch as one long string -// Users should use .split('\n\n') when calling it to build a list of each -// sketch's summary -template -std::string vector_of_kll_sketches::to_string(bool print_levels, bool print_items) const { - std::ostringstream ss; - for (uint32_t i = 0; i < d_; ++i) { - // all streams into 1 string, for compatibility with Python's str() behavior - // users will need to split by \n\n, e.g., str(kll).split('\n\n') - if (i > 0) ss << "\n"; - ss << sketches_[i].to_string(print_levels, print_items); - } - return ss.str(); -} - -template -py::array vector_of_kll_sketches::is_estimation_mode() const { - std::vector vals(d_); - for (uint32_t i = 0; i < d_; ++i) { - vals[i] = sketches_[i].is_estimation_mode(); - } - return py::cast(vals); -} - -// Value of sketch(es) corresponding to some quantile(s) -template -py::array vector_of_kll_sketches::get_quantiles(const py::array_t& ranks, - const py::array_t& isk) const { - std::vector inds = get_indices(isk); - size_t num_sketches = inds.size(); - size_t num_quantiles = ranks.size(); - - std::vector> quants(num_sketches, std::vector(num_quantiles)); - for (uint32_t i = 0; i < num_sketches; ++i) { - for (size_t j = 0; j < num_quantiles; ++j) { - quants[i][j] = sketches_[inds[i]].get_quantile(ranks.data()[j]); - } - } - - return py::cast(quants); -} - -// Value of sketch(es) corresponding to some rank(s) -template -py::array vector_of_kll_sketches::get_ranks(const py::array_t& values, - const py::array_t& isk) const { - std::vector inds = get_indices(isk); - size_t num_sketches = inds.size(); - size_t num_ranks = values.size(); - auto vals = values.data(); - - std::vector> ranks(num_sketches, std::vector(num_ranks)); - for (uint32_t i = 0; i < num_sketches; ++i) { - for (size_t j = 0; j < num_ranks; ++j) { - ranks[i][j] = sketches_[inds[i]].get_rank(vals[j]); - } - } - - return py::cast(ranks); -} - -// PMF(s) of sketch(es) -template -py::array vector_of_kll_sketches::get_pmf(const py::array_t& split_points, - const py::array_t& isk) const { - std::vector inds = get_indices(isk); - size_t num_sketches = inds.size(); - size_t num_splits = split_points.size(); - - std::vector> pmfs(num_sketches, std::vector(num_splits + 1)); - for (uint32_t i = 0; i < num_sketches; ++i) { - auto pmf = sketches_[inds[i]].get_PMF(split_points.data(), num_splits); - for (size_t j = 0; j <= num_splits; ++j) { - pmfs[i][j] = pmf[j]; - } - } - - return py::cast(pmfs); -} - -// CDF(s) of sketch(es) -template -py::array vector_of_kll_sketches::get_cdf(const py::array_t& split_points, - const py::array_t& isk) const { - std::vector inds = get_indices(isk); - size_t num_sketches = inds.size(); - size_t num_splits = split_points.size(); - - std::vector> cdfs(num_sketches, std::vector(num_splits + 1)); - for (uint32_t i = 0; i < num_sketches; ++i) { - auto cdf = sketches_[inds[i]].get_CDF(split_points.data(), num_splits); - for (size_t j = 0; j <= num_splits; ++j) { - cdfs[i][j] = cdf[j]; - } - } - - return py::cast(cdfs); -} - -template -void vector_of_kll_sketches::deserialize(const py::bytes& sk_bytes, - uint32_t idx) { - if (idx >= d_) { - throw std::invalid_argument("request for invalid dimenions >= d (" - + std::to_string(d_) +"): "+ std::to_string(idx)); - } - std::string skStr = sk_bytes; // implicit cast - // load the sketch into the proper index - sketches_[idx] = std::move(kll_sketch::deserialize(skStr.c_str(), skStr.length())); -} - -template -py::list vector_of_kll_sketches::serialize(const py::array_t& isk) { - std::vector inds = get_indices(isk); - const size_t num_sketches = inds.size(); - - py::list list(num_sketches); - for (uint32_t i = 0; i < num_sketches; ++i) { - auto serResult = sketches_[inds[i]].serialize(); - list[i] = py::bytes((char*)serResult.data(), serResult.size()); - } - - return list; -} - -namespace python { -template -double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) { - return kll_sketch::get_normalized_rank_error(k, pmf); -} - -} // namespace datasketches::python - -} // namespace datasketches - -namespace dspy = datasketches::python; - -template -void bind_vector_of_kll_sketches(py::module &m, const char* name) { - using namespace datasketches; - - py::class_>(m, name) - .def(py::init(), py::arg("k")=vector_of_kll_constants::DEFAULT_K, - py::arg("d")=vector_of_kll_constants::DEFAULT_D) - .def(py::init&>()) - // allow user to retrieve k or d, in case it's instantiated w/ defaults - .def("get_k", &vector_of_kll_sketches::get_k, - "Returns the value of `k` of the sketch(es)") - .def("get_d", &vector_of_kll_sketches::get_d, - "Returns the number of sketches") - .def("update", &vector_of_kll_sketches::update, py::arg("items"), - "Updates the sketch(es) with value(s). Must be a 1D array of size equal to the number of sketches. Can also be 2D array of shape (n_updates, n_sketches). If a sketch does not have a value to update, use np.nan") - .def("__str__", &vector_of_kll_sketches::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, - "Produces a string summary of all sketches. Users should split the returned string by '\n\n'") - .def("to_string", &vector_of_kll_sketches::to_string, py::arg("print_levels")=false, - py::arg("print_items")=false, - "Produces a string summary of all sketches. Users should split the returned string by '\n\n'") - .def("is_empty", &vector_of_kll_sketches::is_empty, - "Returns whether the sketch(es) is(are) empty of not") - .def("get_n", &vector_of_kll_sketches::get_n, - "Returns the number of values seen by the sketch(es)") - .def("get_num_retained", &vector_of_kll_sketches::get_num_retained, - "Returns the number of values retained by the sketch(es)") - .def("is_estimation_mode", &vector_of_kll_sketches::is_estimation_mode, - "Returns whether the sketch(es) is(are) in estimation mode") - .def("get_min_values", &vector_of_kll_sketches::get_min_values, - "Returns the minimum value(s) of the sketch(es)") - .def("get_max_values", &vector_of_kll_sketches::get_max_values, - "Returns the maximum value(s) of the sketch(es)") - .def("get_quantiles", &vector_of_kll_sketches::get_quantiles, py::arg("ranks"), - py::arg("isk")=-1, - "Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `ranks` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)") - .def("get_ranks", &vector_of_kll_sketches::get_ranks, py::arg("values"), - py::arg("isk")=-1, - "Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)") - .def("get_pmf", &vector_of_kll_sketches::get_pmf, py::arg("split_points"), py::arg("isk")=-1, - "Returns the probability mass function (PMF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the PMF for (default: all sketches)") - .def("get_cdf", &vector_of_kll_sketches::get_cdf, py::arg("split_points"), py::arg("isk")=-1, - "Returns the cumulative distribution function (CDF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the CDF for (default: all sketches)") - .def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error, - py::arg("k"), py::arg("as_pmf"), "Returns the normalized rank error") - .def("serialize", &vector_of_kll_sketches::serialize, py::arg("isk")=-1, - "Serializes the specified sketch(es). `isk` can be an int or a list/array of ints (default: all sketches)") - .def("deserialize", &vector_of_kll_sketches::deserialize, py::arg("skBytes"), py::arg("isk"), - "Deserializes the specified sketch. `isk` must be an int.") - .def("merge", &vector_of_kll_sketches::merge, py::arg("array_of_sketches"), - "Merges the input array of KLL sketches into the existing array.") - .def("collapse", &vector_of_kll_sketches::collapse, py::arg("isk")=-1, - "Returns the result of collapsing all sketches in the array into a single sketch. 'isk' can be an int or a list/array of ints (default: all sketches)") - ; -} - -void init_vector_of_kll(py::module &m) { - bind_vector_of_kll_sketches(m, "vector_of_kll_ints_sketches"); - bind_vector_of_kll_sketches(m, "vector_of_kll_floats_sketches"); -} diff --git a/python/src/vo_wrapper.cpp b/python/src/vo_wrapper.cpp deleted file mode 100644 index 732810b8..00000000 --- a/python/src/vo_wrapper.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "var_opt_sketch.hpp" -#include "var_opt_union.hpp" -#include "py_serde.hpp" - -#include - -namespace py = pybind11; - -namespace datasketches { - -namespace python { - -template -var_opt_sketch vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) { - std::string skStr = skBytes; // implicit cast - return var_opt_sketch::deserialize(skStr.c_str(), skStr.length(), sd); -} - -template -py::object vo_sketch_serialize(const var_opt_sketch& sk, py_object_serde& sd) { - auto serResult = sk.serialize(0, sd); - return py::bytes((char*)serResult.data(), serResult.size()); -} - -template -size_t vo_sketch_size_bytes(const var_opt_sketch& sk, py_object_serde& sd) { - return sk.get_serialized_size_bytes(sd); -} - -template -var_opt_union vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) { - std::string uStr = uBytes; // implicit cast - return var_opt_union::deserialize(uStr.c_str(), uStr.length(), sd); -} - -template -py::object vo_union_serialize(const var_opt_union& u, py_object_serde& sd) { - auto serResult = u.serialize(0, sd); - return py::bytes((char*)serResult.data(), serResult.size()); -} - -template -size_t vo_union_size_bytes(const var_opt_union& u, py_object_serde& sd) { - return u.get_serialized_size_bytes(sd); -} - -template -py::list vo_sketch_get_samples(const var_opt_sketch& sk) { - py::list list; - for (auto item : sk) { - py::tuple t = py::make_tuple(item.first, item.second); - list.append(t); - } - return list; -} - -template -py::dict vo_sketch_estimate_subset_sum(const var_opt_sketch& sk, const std::function func) { - subset_summary summary = sk.estimate_subset_sum(func); - py::dict d; - d["estimate"] = summary.estimate; - d["lower_bound"] = summary.lower_bound; - d["upper_bound"] = summary.upper_bound; - d["total_sketch_weight"] = summary.total_sketch_weight; - return d; -} - -template -std::string vo_sketch_to_string(const var_opt_sketch& sk, bool print_items) { - if (print_items) { - std::ostringstream ss; - ss << sk.to_string(); - ss << "### VarOpt Sketch Items" << std::endl; - int i = 0; - for (auto item : sk) { - // item.second is always a double - // item.first is an arbitrary py::object, so get the value by - // using internal str() method then casting to C++ std::string - py::str item_pystr(item.first); - std::string item_str = py::cast(item_pystr); - ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl; - } - return ss.str(); - } else { - return sk.to_string(); - } -} - -} -} - -namespace dspy = datasketches::python; - -template -void bind_vo_sketch(py::module &m, const char* name) { - using namespace datasketches; - - py::class_>(m, name) - .def(py::init(), py::arg("k")) - .def("__str__", &dspy::vo_sketch_to_string, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("to_string", &dspy::vo_sketch_to_string, py::arg("print_items")=false, - "Produces a string summary of the sketch") - .def("update", (void (var_opt_sketch::*)(const T&, double)) &var_opt_sketch::update, py::arg("item"), py::arg("weight")=1.0, - "Updates the sketch with the given value and weight") - .def_property_readonly("k", &var_opt_sketch::get_k, - "Returns the sketch's maximum configured sample size") - .def_property_readonly("n", &var_opt_sketch::get_n, - "Returns the total stream length") - .def_property_readonly("num_samples", &var_opt_sketch::get_num_samples, - "Returns the number of samples currently in the sketch") - .def("get_samples", &dspy::vo_sketch_get_samples, - "Returns the set of samples in the sketch") - .def("is_empty", &var_opt_sketch::is_empty, - "Returns True if the sketch is empty, otherwise False") - .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum, - "Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well " - "as upper and lower bounds on the estimate and the total weight processed by the sketch") - .def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes, py::arg("serde"), - "Computes the size in bytes needed to serialize the current sketch") - .def("serialize", &dspy::vo_sketch_serialize, py::arg("serde"), "Serialize the var opt sketch using the provided serde") - .def_static("deserialize", &dspy::vo_sketch_deserialize, py::arg("bytes"), py::arg("serde"), - "Constructs a var opt sketch from the given bytes using the provided serde") - .def("__iter__", [](const var_opt_sketch& sk) { return py::make_iterator(sk.begin(), sk.end()); }); -} - -template -void bind_vo_union(py::module &m, const char* name) { - using namespace datasketches; - - py::class_>(m, name) - .def(py::init(), py::arg("max_k")) - .def("__str__", &var_opt_union::to_string, - "Produces a string summary of the sketch") - .def("to_string", &var_opt_union::to_string, - "Produces a string summary of the sketch") - .def("update", (void (var_opt_union::*)(const var_opt_sketch& sk)) &var_opt_union::update, py::arg("sketch"), - "Updates the union with the given sketch") - .def("get_result", &var_opt_union::get_result, - "Returns a sketch corresponding to the union result") - .def("reset", &var_opt_union::reset, - "Resets the union to the empty state") - .def("get_serialized_size_bytes", &dspy::vo_union_size_bytes, py::arg("serde"), - "Computes the size in bytes needed to serialize the current sketch") - .def("serialize", &dspy::vo_union_serialize, py::arg("serde"), "Serialize the var opt union using the provided serde") - .def_static("deserialize", &dspy::vo_union_deserialize, py::arg("bytes"), py::arg("serde"), - "Constructs a var opt union from the given bytes using the provided serde") - ; -} - -void init_vo(py::module &m) { - bind_vo_sketch(m, "var_opt_sketch"); - bind_vo_union(m, "var_opt_union"); -} diff --git a/python/tests/__init__.py b/python/tests/__init__.py deleted file mode 100644 index 13a83393..00000000 --- a/python/tests/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. diff --git a/python/tests/count_min_test.py b/python/tests/count_min_test.py deleted file mode 100644 index a14c0c10..00000000 --- a/python/tests/count_min_test.py +++ /dev/null @@ -1,86 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import count_min_sketch - -class CountMinTest(unittest.TestCase): - def test_count_min_example(self): - # we'll define target confidence and relative error and use the built-in - # methods to determine how many hashes and buckets to use - confidence = 0.95 - num_hashes = count_min_sketch.suggest_num_hashes(confidence) - relative_error = 0.01 - num_buckets = count_min_sketch.suggest_num_buckets(relative_error) - - # now we can create a few empty sketches - cm = count_min_sketch(num_hashes, num_buckets) - cm2 = count_min_sketch(num_hashes, num_buckets) - self.assertTrue(cm.is_empty()) - - # we'll use a moderate number of distinct items with - # increasing weights, with each item's weight being - # equal to its value - n = 1000 - total_wt = 0 - for i in range(1, n+1): - cm.update(i, i) - total_wt += i - self.assertFalse(cm.is_empty()) - self.assertEqual(cm.get_total_weight(), total_wt) - - # querying the items, each of them should - # have a non-zero count. the estimate should - # be at least i with appropriately behaved bounds. - for i in range(1, n+1): - val = cm.get_estimate(i) - self.assertGreaterEqual(val, i) - self.assertGreaterEqual(val, cm.get_lower_bound(i)) - self.assertGreater(cm.get_upper_bound(i), val) - - # values not in the sketch should have lower estimates, but - # are not guaranteed to be zero and will succeed - self.assertIsNotNone(cm.get_estimate("not in set")) - - # we can create another sketch with partial overlap - # and merge them - for i in range(int(n / 2), int(3 * n / 2)): - cm2.update(i, i) - cm.merge(cm2) - - # and the estimated weight for the overlapped meerged values - # (n/2 to n) should now be at least 2x the value - self.assertGreaterEqual(cm.get_estimate(n), 2 * n) - - # finally, serialize and reconstruct - cm_bytes = cm.serialize() - self.assertEqual(cm.get_serialized_size_bytes(), len(cm_bytes)) - new_cm = count_min_sketch.deserialize(cm_bytes) - - # and now interrogate the sketch - self.assertFalse(new_cm.is_empty()) - self.assertEqual(new_cm.get_num_hashes(), cm.get_num_hashes()) - self.assertEqual(new_cm.get_num_buckets(), cm.get_num_buckets()) - self.assertEqual(new_cm.get_total_weight(), cm.get_total_weight()) - - # we can also iterate through values in and out of the sketch to ensure - # the estimates match - for i in range(0, 2 * n): - self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i)) - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/cpc_test.py b/python/tests/cpc_test.py deleted file mode 100644 index 9f20670d..00000000 --- a/python/tests/cpc_test.py +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import cpc_sketch, cpc_union - -class CpcTest(unittest.TestCase): - def test_cpc_example(self): - lgk = 12 # 2^k = 4096 rows in the table - n = 1 << 18 # ~256k distinct values - - # create a couple sketches and inject some values - # we'll have 1/4 of the values overlap - cpc = cpc_sketch(lgk) - cpc2 = cpc_sketch(lgk) - offset = int(3 * n / 4) # it's a float w/o cast - # because we hash on the bits, not an abstract numeric value, - # cpc.update(1) and cpc.update(1.0) give different results. - for i in range(0, n): - cpc.update(i) - cpc2.update(i + offset) - - # although we provide get_composite_estimate() and get_estimate(), - # the latter will always give the best available estimate. we - # recommend using get_estimate(). - # we can check that the upper and lower bounds bracket the - # estimate, without needing to know the exact value. - self.assertLessEqual(cpc.get_lower_bound(1), cpc.get_estimate()) - self.assertGreaterEqual(cpc.get_upper_bound(1), cpc.get_estimate()) - - # union is a separate class, so we need to get_result() - # to query the unioned sketches - union = cpc_union(lgk) - union.update(cpc) - union.update(cpc2) - result = union.get_result() - - # since our process here (including post-union CPC) is - # deterministic, we have checked and know the exact - # answer is within one standard deviation of the estimate - self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4) - self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4) - - # serialize for storage and reconstruct - sk_bytes = result.serialize() - new_cpc = cpc_sketch.deserialize(sk_bytes) - self.assertFalse(new_cpc.is_empty()) - - def test_cpc_get_lg_k(self): - lgk = 10 - cpc = cpc_sketch(lgk) - self.assertEqual(cpc.get_lg_k(), lgk) - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/density_test.py b/python/tests/density_test.py deleted file mode 100644 index 4d8a2763..00000000 --- a/python/tests/density_test.py +++ /dev/null @@ -1,93 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import density_sketch, KernelFunction -import numpy as np - -class UnitSphereKernel(KernelFunction): - def __call__(self, a: np.array, b: np.array) -> float: - if np.linalg.norm(a - b) < 1.0: - return 1.0 - else: - return 0.0 - -class densityTest(unittest.TestCase): - def test_density_sketch(self): - k = 10 - dim = 3 - n = 1000 - - sketch = density_sketch(k, dim) - - self.assertEqual(sketch.get_k(), k) - self.assertEqual(sketch.get_dim(), dim) - self.assertTrue(sketch.is_empty()) - self.assertFalse(sketch.is_estimation_mode()) - self.assertEqual(sketch.get_n(), 0) - self.assertEqual(sketch.get_num_retained(), 0) - - for i in range(n): - sketch.update([i, i, i]) - - self.assertFalse(sketch.is_empty()) - self.assertTrue(sketch.is_estimation_mode()) - self.assertEqual(sketch.get_n(), n) - self.assertGreater(sketch.get_num_retained(), k) - self.assertLess(sketch.get_num_retained(), n) - self.assertGreater(sketch.get_estimate([n - 1, n - 1, n - 1]), 0) - - for tuple in sketch: - vector = tuple[0] - weight = tuple[1] - self.assertEqual(len(vector), dim) - self.assertGreaterEqual(weight, 1) - - sk_bytes = sketch.serialize() - sketch2 = density_sketch.deserialize(sk_bytes) - self.assertEqual(sketch.get_estimate([1.5, 2.5, 3.5]), sketch2.get_estimate([1.5, 2.5, 3.5])) - - def test_density_merge(self): - sketch1 = density_sketch(10, 2) - sketch1.update([0, 0]) - sketch2 = density_sketch(10, 2) - sketch2.update([0, 1]) - sketch1.merge(sketch2) - self.assertEqual(sketch1.get_n(), 2) - self.assertEqual(sketch1.get_num_retained(), 2) - - def test_custom_kernel(self): - gaussianSketch = density_sketch(10, 2) # default kernel - sphericalSketch = density_sketch(10, 2, UnitSphereKernel()) - - p = [1, 1] - gaussianSketch.update(p) - sphericalSketch.update(p) - - # Spherical kernel should return 1.0 for a nearby point, 0 farther - # Gaussian kernel should return something nonzero when farther away - self.assertEqual(sphericalSketch.get_estimate([1.001, 1]), 1.0) - self.assertEqual(sphericalSketch.get_estimate([2, 2]), 0.0) - self.assertGreater(gaussianSketch.get_estimate([2, 2]), 0.0) - - # We can also use a custom kernel when deserializing - sk_bytes = sphericalSketch.serialize() - sphericalRebuilt = density_sketch.deserialize(sk_bytes, UnitSphereKernel()) - self.assertEqual(sphericalSketch.get_estimate([1.001, 1]), sphericalRebuilt.get_estimate([1.001, 1])) - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/fi_test.py b/python/tests/fi_test.py deleted file mode 100644 index b5ca71a7..00000000 --- a/python/tests/fi_test.py +++ /dev/null @@ -1,149 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import frequent_strings_sketch, frequent_items_sketch -from datasketches import frequent_items_error_type, PyIntsSerDe - -class FiTest(unittest.TestCase): - def test_fi_strings_example(self): - k = 3 # a small value so we can easily fill the sketch - fi = frequent_strings_sketch(k) - - # we'll use a small number of distinct items so we - # can use exponentially increasing weights and have - # some frequent items, decreasing so we have some - # small items inserted after a purge - n = 8 - for i in range(0, n): - fi.update(str(i), 2 ** (n - i)) - - # there are two ways to extract items : - # * NO_FALSE_POSITIVES includes all items with a lower bound - # above the a posteriori error - # * NO_FALSE_NEGATIVES includes all items with an uper bound - # above the a posteriori error - # a more complete discussion may be found at - # https://datasketches.github.io/docs/Frequency/FrequentItemsOverview.html - items_no_fp = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES) - items_no_fn = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES) - self.assertLessEqual(len(items_no_fp), len(items_no_fn)) - - # the items list returns a decreasing weight-sorted list, and - # for each item we have (item, estimate, lower_bound, upper_bound) - item = items_no_fp[1] - self.assertLessEqual(item[2], item[1]) # lower bound vs estimate - self.assertLessEqual(item[1], item[3]) # estimate vs upper bound - - # we can also query directly for a specific item - id = items_no_fn[0][0] - est = fi.get_estimate(id) - lb = fi.get_lower_bound(id) - ub = fi.get_upper_bound(id) - self.assertLessEqual(lb, est) - self.assertLessEqual(est, ub) - - # the values are zero if the item isn't in our list - self.assertEqual(fi.get_estimate("NaN"), 0) - - # now create a second sketch with a lot of unique - # values but all with equal weight (of 1) such that - # the total weight is much larger than the first sketch - fi2 = frequent_strings_sketch(k) - wt = fi.get_total_weight() - for i in range(0, 4*wt): - fi2.update(str(i)) - - # merge the second sketch into the first - fi.merge(fi2) - - # we can see that the weight is much larger - self.assertEqual(5 * wt, fi.get_total_weight()) - - # querying with NO_FALSE_POSITIVES means we don't find anything - # heavy enough to return - items_no_fp = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES) - self.assertEqual(len(items_no_fp), 0) - - # we do, however, find a few potential heavy items - # if querying with NO_FALSE_NEGATIVES - items_no_fn = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES) - self.assertGreater(len(items_no_fn), 0) - - # finally, serialize and reconstruct - fi_bytes = fi.serialize() - self.assertEqual(len(fi_bytes), fi.get_serialized_size_bytes()) - new_fi = frequent_strings_sketch.deserialize(fi_bytes) - - # and now interrogate the sketch - self.assertFalse(new_fi.is_empty()) - self.assertGreater(new_fi.get_num_active_items(), 0) - self.assertEqual(5 * wt, new_fi.get_total_weight()) - - # This example uses generic objects but is otherwise identical - def test_fi_items_example(self): - k = 3 # a small value so we can easily fill the sketch - fi = frequent_items_sketch(k) - - # as above, but in this case inserting ints - n = 8 - for i in range(0, n): - fi.update(i, 2 ** (n - i)) - - # everything else works identically, so let's jump straight - # to merging and serialization - - # now create a second sketch with a lot of unique - # values but all with equal weight (of 1) such that - # the total weight is much larger than the first sketch - fi2 = frequent_items_sketch(k) - wt = fi.get_total_weight() - for i in range(0, 4*wt): - fi2.update(i) - - # merge the second sketch into the first - fi.merge(fi2) - - # we can see that the weight is much larger - self.assertEqual(5 * wt, fi.get_total_weight()) - - # finally, serialize and reconstruct -- now we need a serde to tell - # (de)serialization how to interpret the objects - fi_bytes = fi.serialize(PyIntsSerDe()) - self.assertEqual(len(fi_bytes), fi.get_serialized_size_bytes(PyIntsSerDe())) - new_fi = frequent_items_sketch.deserialize(fi_bytes, PyIntsSerDe()) - - # and again interrogate the sketch to check that it's what we serialized - self.assertFalse(new_fi.is_empty()) - self.assertGreater(new_fi.get_num_active_items(), 0) - self.assertEqual(5 * wt, new_fi.get_total_weight()) - - - def test_fi_sketch(self): - # only testing a few things not used in the above example - k = 12 - wt = 10000 - fi = frequent_strings_sketch(k) - - self.assertAlmostEqual(fi.get_sketch_epsilon(), 0.0008545, delta=1e-6) - - sk_apriori_error = fi.get_sketch_epsilon() * wt - reference_apriori_error = frequent_strings_sketch.get_apriori_error(k, wt) - self.assertAlmostEqual(sk_apriori_error, reference_apriori_error, delta=1e-6) - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/hll_test.py b/python/tests/hll_test.py deleted file mode 100644 index 32d762ee..00000000 --- a/python/tests/hll_test.py +++ /dev/null @@ -1,129 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import hll_sketch, hll_union, tgt_hll_type - -class HllTest(unittest.TestCase): - def test_hll_example(self): - lgk = 12 # 2^k = 4096 rows in the table - n = 1 << 18 # ~256k unique values - - # create a couple sketches and inject some values - # we'll have 1/4 of the values overlap - hll = hll_sketch(lgk, tgt_hll_type.HLL_8) - hll2 = hll_sketch(lgk, tgt_hll_type.HLL_6) - offset = int(3 * n / 4) # it's a float w/o cast - # because we hash on the bits, not an abstract numeric value, - # hll.update(1) and hll.update(1.0) give different results. - for i in range(0, n): - hll.update(i) - hll2.update(i + offset) - - # we can check that the upper and lower bounds bracket the - # estimate, without needing to know the exact value. - self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate()) - self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate()) - - # union is a separate class, and we can either get a result - # sketch or query the union object directly - union = hll_union(lgk) - union.update(hll) - union.update(hll2) - result = union.get_result() - self.assertEqual(result.get_estimate(), union.get_estimate()) - - # since our process here (including post-union HLL) is - # deterministic, we have checked and know the exact - # answer is within one standard deviation of the estimate - self.assertLessEqual(union.get_lower_bound(1), 7 * n / 4) - self.assertGreaterEqual(union.get_upper_bound(1), 7 * n / 4) - - # serialize for storage and reconstruct - sk_bytes = result.serialize_compact() - self.assertEqual(len(sk_bytes), result.get_compact_serialization_bytes()) - new_hll = hll_sketch.deserialize(sk_bytes) - - # the sketch can self-report its configuration and status - self.assertEqual(new_hll.lg_config_k, lgk) - self.assertEqual(new_hll.tgt_type, tgt_hll_type.HLL_4) - self.assertFalse(new_hll.is_empty()) - - # if we want to reduce some object overhead, we can also reset - new_hll.reset() - self.assertTrue(new_hll.is_empty()) - - def test_hll_sketch(self): - lgk = 8 - n = 117 - hll = self.generate_sketch(n, lgk, tgt_hll_type.HLL_6) - hll.update('string data') - hll.update(3.14159) # double data - - self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate()) - self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate()) - - self.assertEqual(hll.lg_config_k, lgk) - self.assertEqual(hll.tgt_type, tgt_hll_type.HLL_6) - - bytes_compact = hll.serialize_compact() - bytes_update = hll.serialize_updatable() - self.assertEqual(len(bytes_compact), hll.get_compact_serialization_bytes()) - self.assertEqual(len(bytes_update), hll.get_updatable_serialization_bytes()) - - self.assertFalse(hll.is_compact()) - self.assertFalse(hll.is_empty()) - - self.assertTrue(isinstance(hll_sketch.deserialize(bytes_compact), hll_sketch)) - self.assertTrue(isinstance(hll_sketch.deserialize(bytes_update), hll_sketch)) - - self.assertIsNotNone(hll_sketch.get_rel_err(True, False, 12, 1)) - self.assertIsNotNone(hll_sketch.get_max_updatable_serialization_bytes(20, tgt_hll_type.HLL_6)) - - hll.reset() - self.assertTrue(hll.is_empty()) - - def test_hll_union(self): - lgk = 7 - n = 53 - union = hll_union(lgk) - - sk = self.generate_sketch(n, lgk, tgt_hll_type.HLL_4, 0) - union.update(sk) - sk = self.generate_sketch(3 * n, lgk, tgt_hll_type.HLL_4, n) - union.update(sk) - union.update('string data') - union.update(1.4142136) - - self.assertLessEqual(union.get_lower_bound(1), union.get_estimate()) - self.assertGreaterEqual(union.get_upper_bound(1), union.get_estimate()) - - self.assertEqual(union.lg_config_k, lgk) - self.assertFalse(union.is_empty()) - - sk = union.get_result() - self.assertTrue(isinstance(sk, hll_sketch)) - self.assertEqual(sk.tgt_type, tgt_hll_type.HLL_4) - - def generate_sketch(self, n, lgk, sk_type=tgt_hll_type.HLL_4, st_idx=0): - sk = hll_sketch(lgk, sk_type) - for i in range(st_idx, st_idx + n): - sk.update(i) - return sk - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/kll_test.py b/python/tests/kll_test.py deleted file mode 100644 index e79823ad..00000000 --- a/python/tests/kll_test.py +++ /dev/null @@ -1,159 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch -from datasketches import kll_items_sketch, ks_test, PyStringsSerDe -import numpy as np - -class KllTest(unittest.TestCase): - def test_kll_floats_example(self): - k = 160 - n = 2 ** 20 - - # create a sketch and inject ~1 million N(0,1) points as an array and as a single item - kll = kll_floats_sketch(k) - kll.update(np.random.normal(size=n-1)) - kll.update(0.0) - - # 0 should be near the median - self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.035) - - # the median should be near 0 - self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.035) - - # we also track the min/max independently from the rest of the data - # which lets us know the full observed data range - self.assertLessEqual(kll.get_min_value(), kll.get_quantile(0.01)) - self.assertLessEqual(0.0, kll.get_rank(kll.get_min_value())) - self.assertGreaterEqual(kll.get_max_value(), kll.get_quantile(0.99)) - self.assertGreaterEqual(1.0, kll.get_rank(kll.get_max_value())) - - # we can also extract a list of values at a time, - # here the values should give us something close to [-2, -1, 0, 1, 2]. - # then get the CDF, which will return something close to - # the original values used in get_quantiles() - # finally, can check the normalized rank error bound - pts = kll.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772]) - cdf = kll.get_cdf(pts) # include 1.0 at end to account for all probability mass - self.assertEqual(len(cdf), len(pts)+1) - err = kll.normalized_rank_error(False) - self.assertEqual(err, kll_floats_sketch.get_normalized_rank_error(k, False)) - - # and a few basic queries about the sketch - self.assertFalse(kll.is_empty()) - self.assertTrue(kll.is_estimation_mode()) - self.assertEqual(kll.get_n(), n) - self.assertEqual(kll.get_k(), k) - self.assertLess(kll.get_num_retained(), n) - - # merging itself will double the number of items the sketch has seen - # but need to do that with a copy - kll_copy = kll_floats_sketch(kll) - kll.merge(kll_copy) - self.assertEqual(kll.get_n(), 2*n) - - # we can then serialize and reconstruct the sketch - kll_bytes = kll.serialize() - new_kll = kll_floats_sketch.deserialize(kll_bytes) - self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained()) - self.assertEqual(kll.get_min_value(), new_kll.get_min_value()) - self.assertEqual(kll.get_max_value(), new_kll.get_max_value()) - self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7)) - self.assertEqual(kll.get_rank(0.0), new_kll.get_rank(0.0)) - - # A Kolmogorov-Smirnov Test of kll and new_kll should match, even for - # a fairly small p-value -- cannot reject the null hypothesis that - # they come from the same distribution (since they do) - self.assertFalse(ks_test(kll, new_kll, 0.001)) - - total_weight = 0 - for tuple in kll: - item = tuple[0] - weight = tuple[1] - total_weight = total_weight + weight - self.assertEqual(total_weight, kll.get_n()) - - def test_kll_ints_sketch(self): - k = 100 - n = 10 - kll = kll_ints_sketch(k) - for i in range(0, n): - kll.update(i) - - self.assertEqual(kll.get_min_value(), 0) - self.assertEqual(kll.get_max_value(), n-1) - self.assertEqual(kll.get_n(), n) - self.assertFalse(kll.is_empty()) - self.assertFalse(kll.is_estimation_mode()) # n < k - self.assertEqual(kll.get_k(), k) - - pmf = kll.get_pmf([round(n/2)]) - self.assertIsNotNone(pmf) - self.assertEqual(len(pmf), 2) - - cdf = kll.get_cdf([round(n/2)]) - self.assertIsNotNone(cdf) - self.assertEqual(len(cdf), 2) - - self.assertEqual(kll.get_quantile(0.5), round(n/2)) - quants = kll.get_quantiles([0.25, 0.5, 0.75]) - self.assertIsNotNone(quants) - self.assertEqual(len(quants), 3) - - self.assertEqual(kll.get_rank(round(n/2)), 0.5) - - # merge copy of self - kll_copy = kll_ints_sketch(kll) - kll.merge(kll_copy) - self.assertEqual(kll.get_n(), 2 * n) - - sk_bytes = kll.serialize() - self.assertTrue(isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch)) - - def test_kll_doubles_sketch(self): - # already tested float and ints and it's templatized, so just make sure it instantiates properly - k = 75 - kll = kll_doubles_sketch(k) - self.assertTrue(kll.is_empty()) - - def test_kll_items_sketch(self): - # most functionality has been tested, but we need to ensure objects and sorting work - # as well as serialization - k = 100 - n = 2 ** 16 - - # create a sketch and inject enough points to force compaction - kll = kll_items_sketch(k) - for i in range(0, n): - kll.update(str(i)) - - kll_copy = kll_items_sketch(kll) - kll.merge(kll_copy) - self.assertEqual(kll.get_n(), 2 * n) - - kll_bytes = kll.serialize(PyStringsSerDe()) - new_kll = kll_items_sketch.deserialize(kll_bytes, PyStringsSerDe()) - self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained()) - self.assertEqual(kll.get_min_value(), new_kll.get_min_value()) - self.assertEqual(kll.get_max_value(), new_kll.get_max_value()) - self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7)) - self.assertEqual(kll.get_rank(str(n/4)), new_kll.get_rank(str(n/4))) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/quantiles_test.py b/python/tests/quantiles_test.py deleted file mode 100644 index ce2148f1..00000000 --- a/python/tests/quantiles_test.py +++ /dev/null @@ -1,160 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch -from datasketches import quantiles_items_sketch, ks_test, PyStringsSerDe -import numpy as np - -class QuantilesTest(unittest.TestCase): - def test_quantiles_floats_example(self): - k = 128 - n = 2 ** 20 - - # create a sketch and inject ~1 million N(0,1) points as an array and as a single item - quantiles = quantiles_floats_sketch(k) - quantiles.update(np.random.normal(size=n-1)) - quantiles.update(0.0) - - # 0 should be near the median - self.assertAlmostEqual(0.5, quantiles.get_rank(0.0), delta=0.035) - - # the median should be near 0 - self.assertAlmostEqual(0.0, quantiles.get_quantile(0.5), delta=0.035) - - # we also track the min/max independently from the rest of the data - # which lets us know the full observed data range - self.assertLessEqual(quantiles.get_min_value(), quantiles.get_quantile(0.01)) - self.assertLessEqual(0.0, quantiles.get_rank(quantiles.get_min_value())) - self.assertGreaterEqual(quantiles.get_max_value(), quantiles.get_quantile(0.99)) - self.assertGreaterEqual(1.0, quantiles.get_rank(quantiles.get_max_value())) - - # we can also extract a list of values at a time, - # here the values should give us something close to [-2, -1, 0, 1, 2]. - # then get the CDF, which will return something close to - # the original values used in get_quantiles() - # finally, can check the normalized rank error bound - pts = quantiles.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772]) - cdf = quantiles.get_cdf(pts) # include 1.0 at end to account for all probability mass - self.assertEqual(len(cdf), len(pts)+1) - err = quantiles.normalized_rank_error(False) - self.assertEqual(err, quantiles_floats_sketch.get_normalized_rank_error(k, False)) - - # and a few basic queries about the sketch - self.assertFalse(quantiles.is_empty()) - self.assertTrue(quantiles.is_estimation_mode()) - self.assertEqual(quantiles.get_n(), n) - self.assertEqual(quantiles.get_k(), k) - self.assertLess(quantiles.get_num_retained(), n) - - # merging itself will double the number of items the sketch has seen - quantiles_copy = quantiles_floats_sketch(quantiles) - quantiles.merge(quantiles_copy) - self.assertEqual(quantiles.get_n(), 2*n) - - # we can then serialize and reconstruct the sketch - quantiles_bytes = quantiles.serialize() - new_quantiles = quantiles_floats_sketch.deserialize(quantiles_bytes) - self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained()) - self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value()) - self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value()) - self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7)) - self.assertEqual(quantiles.get_rank(0.0), new_quantiles.get_rank(0.0)) - - # If we create a new sketch with a very different distribution, a Kolmogorov-Smirnov Test - # of the two should return True: we can reject the null hypothesis that the sketches - # come from the same distributions. - unif_quantiles = quantiles_floats_sketch(k) - unif_quantiles.update(np.random.uniform(10, 20, size=n-1)) - self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001)) - - total_weight = 0 - for tuple in quantiles: - item = tuple[0] - weight = tuple[1] - total_weight = total_weight + weight - self.assertEqual(total_weight, quantiles.get_n()) - - def test_quantiles_ints_sketch(self): - k = 128 - n = 10 - quantiles = quantiles_ints_sketch(k) - for i in range(0, n): - quantiles.update(i) - - self.assertEqual(quantiles.get_min_value(), 0) - self.assertEqual(quantiles.get_max_value(), n-1) - self.assertEqual(quantiles.get_n(), n) - self.assertFalse(quantiles.is_empty()) - self.assertFalse(quantiles.is_estimation_mode()) # n < k - self.assertEqual(quantiles.get_k(), k) - - pmf = quantiles.get_pmf([round(n/2)]) - self.assertIsNotNone(pmf) - self.assertEqual(len(pmf), 2) - - cdf = quantiles.get_cdf([round(n/2)]) - self.assertIsNotNone(cdf) - self.assertEqual(len(cdf), 2) - - self.assertEqual(quantiles.get_quantile(0.5), round(n/2)) - quants = quantiles.get_quantiles([0.25, 0.5, 0.75]) - self.assertIsNotNone(quants) - self.assertEqual(len(quants), 3) - - self.assertEqual(quantiles.get_rank(round(n/2)), 0.5) - - # merge self - quantiles_copy = quantiles_ints_sketch(quantiles) - quantiles.merge(quantiles_copy) - self.assertEqual(quantiles.get_n(), 2 * n) - - sk_bytes = quantiles.serialize() - self.assertTrue(isinstance(quantiles_ints_sketch.deserialize(sk_bytes), quantiles_ints_sketch)) - - def test_quantiles_doubles_sketch(self): - # already tested floats and ints and it's templatized, so just make sure it instantiates properly - k = 128 - quantiles = quantiles_doubles_sketch(k) - self.assertTrue(quantiles.is_empty()) - - def test_quantiles_items_sketch(self): - # most functionality has been tested, but we need to ensure objects and sorting work - # as well as serialization - k = 128 - n = 2 ** 16 - - # create a sketch and inject enough points to force compaction - quantiles = quantiles_items_sketch(k) - for i in range(0, n): - quantiles.update(str(i)) - - quantiles_copy = quantiles_items_sketch(quantiles) - quantiles.merge(quantiles_copy) - self.assertEqual(quantiles.get_n(), 2 * n) - - quantiles_bytes = quantiles.serialize(PyStringsSerDe()) - new_quantiles = quantiles_items_sketch.deserialize(quantiles_bytes, PyStringsSerDe()) - self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained()) - self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value()) - self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value()) - self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7)) - self.assertEqual(quantiles.get_rank(str(n/4)), new_quantiles.get_rank(str(n/4))) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/req_test.py b/python/tests/req_test.py deleted file mode 100644 index b78b200e..00000000 --- a/python/tests/req_test.py +++ /dev/null @@ -1,159 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import req_ints_sketch, req_floats_sketch, req_items_sketch, PyStringsSerDe -import numpy as np - -class reqTest(unittest.TestCase): - def test_req_example(self): - k = 12 - n = 2 ** 20 - - # create a sketch and inject ~1 million N(0,1) points as an array and as a single item - req = req_floats_sketch(k, True) # high rank accuracy - req.update(np.random.normal(size=n-1)) - req.update(0.0) - - # 0 should be near the median - self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.045) - - # the median should be near 0 - self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.045) - - # we also track the min/max independently from the rest of the data - # which lets us know the full observed data range - self.assertLessEqual(req.get_min_value(), req.get_quantile(0.01)) - self.assertLessEqual(0.0, req.get_rank(req.get_min_value())) - self.assertGreaterEqual(req.get_max_value(), req.get_quantile(0.99)) - self.assertGreaterEqual(1.0, req.get_rank(req.get_max_value())) - - # we can also extract a list of values at a time, - # here the values should give us something close to [-2, -1, 0, 1, 2]. - # then get the CDF, which will return something close to - # the original values used in get_quantiles() - # finally, can check the normalized rank error bound - pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772]) - cdf = req.get_cdf(pts) # include 1.0 at end to account for all probability mass - self.assertEqual(len(cdf), len(pts)+1) - - # For relative error quantiles, the error depends on the actual rank - # so we need to use that to detemrine the bounds - est = req.get_rank(0.999, True) - lb = req.get_rank_lower_bound(est, 1) - ub = req.get_rank_upper_bound(est, 1) - self.assertLessEqual(lb, est) - self.assertLessEqual(est, ub) - - # and a few basic queries about the sketch - self.assertFalse(req.is_empty()) - self.assertTrue(req.is_estimation_mode()) - self.assertEqual(req.get_n(), n) - self.assertLess(req.get_num_retained(), n) - self.assertEqual(req.get_k(), k) - - # merging itself will double the number of items the sketch has seen - req_copy = req_floats_sketch(req) - req.merge(req_copy) - self.assertEqual(req.get_n(), 2*n) - - # we can then serialize and reconstruct the sketch - req_bytes = req.serialize() - new_req = req_floats_sketch.deserialize(req_bytes) - self.assertEqual(req.get_num_retained(), new_req.get_num_retained()) - self.assertEqual(req.get_min_value(), new_req.get_min_value()) - self.assertEqual(req.get_max_value(), new_req.get_max_value()) - self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7)) - self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0)) - - total_weight = 0 - for tuple in req: - item = tuple[0] - weight = tuple[1] - total_weight = total_weight + weight - self.assertEqual(total_weight, req.get_n()) - - def test_req_ints_sketch(self): - k = 100 - n = 10 - req = req_ints_sketch(k) - for i in range(0, n): - req.update(i) - - self.assertEqual(req.get_min_value(), 0) - self.assertEqual(req.get_max_value(), n-1) - self.assertEqual(req.get_n(), n) - self.assertFalse(req.is_empty()) - self.assertFalse(req.is_estimation_mode()) # n < k - self.assertEqual(req.get_k(), k) - - pmf = req.get_pmf([round(n/2)]) - self.assertIsNotNone(pmf) - self.assertEqual(len(pmf), 2) - - cdf = req.get_cdf([round(n/2)]) - self.assertIsNotNone(cdf) - self.assertEqual(len(cdf), 2) - - self.assertEqual(req.get_quantile(0.5), round(n/2)) - quants = req.get_quantiles([0.25, 0.5, 0.75]) - self.assertIsNotNone(quants) - self.assertEqual(len(quants), 3) - - self.assertEqual(req.get_rank(round(n/2)), 0.5) - - # merge self - req_copy = req_ints_sketch(req) - req.merge(req_copy) - self.assertEqual(req.get_n(), 2 * n) - - sk_bytes = req.serialize() - self.assertTrue(isinstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch)) - - def test_req_floats_sketch(self): - # already tested floats with LRA so just check that HRA works - k = 75 - req = req_floats_sketch(k, False) # low rank accuracy - self.assertTrue(req.is_empty()) - self.assertFalse(req.is_hra()) - - def test_req_items_sketch(self): - # most functionality has been tested, but we need to ensure objects and sorting work - # as well as serialization - k = 100 - n = 2 ** 16 - - # create a sketch and inject enough points to force compaction - req = req_items_sketch(k) - for i in range(0, n): - req.update(str(i)) - - req_copy = req_items_sketch(req) - req.merge(req_copy) - self.assertEqual(req.get_n(), 2 * n) - - req_bytes = req.serialize(PyStringsSerDe()) - new_req = req_items_sketch.deserialize(req_bytes, PyStringsSerDe()) - self.assertEqual(req.get_num_retained(), new_req.get_num_retained()) - self.assertEqual(req.get_min_value(), new_req.get_min_value()) - self.assertEqual(req.get_max_value(), new_req.get_max_value()) - self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7)) - self.assertEqual(req.get_rank(str(n/4)), new_req.get_rank(str(n/4))) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/theta_test.py b/python/tests/theta_test.py deleted file mode 100644 index f2798c48..00000000 --- a/python/tests/theta_test.py +++ /dev/null @@ -1,156 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest - -from datasketches import theta_sketch, update_theta_sketch -from datasketches import compact_theta_sketch, theta_union -from datasketches import theta_intersection, theta_a_not_b -from datasketches import theta_jaccard_similarity - -class ThetaTest(unittest.TestCase): - def test_theta_basic_example(self): - lgk = 12 # 2^k = 4096 rows in the table - n = 1 << 18 # ~256k unique values - - # create a sketch and inject some values - sk = self.generate_theta_sketch(n, lgk) - - # we can check that the upper and lower bounds bracket the - # estimate, without needing to know the exact value. - self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate()) - self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate()) - - # because this sketch is deterministically generated, we can - # also compare against the exact value - self.assertLessEqual(sk.get_lower_bound(1), n) - self.assertGreaterEqual(sk.get_upper_bound(1), n) - - # compact and serialize for storage, then reconstruct - sk_bytes = sk.compact().serialize() - new_sk = compact_theta_sketch.deserialize(sk_bytes) - - # estimate remains unchanged - self.assertFalse(sk.is_empty()) - self.assertEqual(sk.get_estimate(), new_sk.get_estimate()) - - count = 0 - for hash in new_sk: - self.assertLess(hash, new_sk.get_theta64()) - count = count + 1 - self.assertEqual(count, new_sk.get_num_retained()) - - num = sk.get_num_retained() - sk.trim() - self.assertLessEqual(sk.get_num_retained(), num) - - sk.reset() - self.assertTrue(sk.is_empty()) - self.assertEqual(sk.get_num_retained(), 0) - - def test_theta_set_operations(self): - lgk = 12 # 2^k = 4096 rows in the table - n = 1 << 18 # ~256k unique values - - # we'll have 1/4 of the values overlap - offset = int(3 * n / 4) # it's a float w/o cast - - # create a couple sketches and inject some values - sk1 = self.generate_theta_sketch(n, lgk) - sk2 = self.generate_theta_sketch(n, lgk, offset) - - # UNIONS - # create a union object - union = theta_union(lgk) - union.update(sk1) - union.update(sk2) - - # getting result from union returns a compact_theta_sketch - # compact theta sketches can be used in additional unions - # or set operations but cannot accept further item updates - result = union.get_result() - self.assertTrue(isinstance(result, compact_theta_sketch)) - - # since our process here is deterministic, we have - # checked and know the exact answer is within one - # standard deviation of the estimate - self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4) - self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4) - - # INTERSECTIONS - # create an intersection object - intersect = theta_intersection() # no lg_k - intersect.update(sk1) - intersect.update(sk2) - - # has_result() indicates the intersection has been used, - # although the result may be the empty set - self.assertTrue(intersect.has_result()) - - # as with unions, the result is a compact sketch - result = intersect.get_result() - self.assertTrue(isinstance(result, compact_theta_sketch)) - - # we know the sets overlap by 1/4 - self.assertLessEqual(result.get_lower_bound(1), n / 4) - self.assertGreaterEqual(result.get_upper_bound(1), n / 4) - - # A NOT B - # create an a_not_b object - anb = theta_a_not_b() # no lg_k - result = anb.compute(sk1, sk2) - - # as with unions, the result is a compact sketch - self.assertTrue(isinstance(result, compact_theta_sketch)) - - # we know the sets overlap by 1/4, so the remainder is 3/4 - self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4) - self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4) - - - # JACCARD SIMILARITY - # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound) - jac = theta_jaccard_similarity.jaccard(sk1, sk2) - - # we can check that results are in the expected order - self.assertLess(jac[0], jac[1]) - self.assertLess(jac[1], jac[2]) - - # checks for sketch equivalency - self.assertTrue(theta_jaccard_similarity.exactly_equal(sk1, sk1)) - self.assertFalse(theta_jaccard_similarity.exactly_equal(sk1, sk2)) - - # we can apply a check for similarity or dissimilarity at a - # given threshhold, at 97.7% confidence. - - # check that the Jaccard Index is at most (upper bound) 0.2. - # exact result would be 1/7 - self.assertTrue(theta_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2)) - - # check that the Jaccard Index is at least (lower bound) 0.7 - # exact result would be 3/4, using result from A NOT B test - self.assertTrue(theta_jaccard_similarity.similarity_test(sk1, result, 0.7)) - - - def generate_theta_sketch(self, n, lgk, offset=0): - sk = update_theta_sketch(lgk) - for i in range(0, n): - sk.update(i + offset) - return sk - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/tuple_test.py b/python/tests/tuple_test.py deleted file mode 100644 index 6327599f..00000000 --- a/python/tests/tuple_test.py +++ /dev/null @@ -1,213 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest - -from datasketches import update_tuple_sketch -from datasketches import compact_tuple_sketch, tuple_union -from datasketches import tuple_intersection, tuple_a_not_b -from datasketches import tuple_jaccard_similarity -from datasketches import tuple_jaccard_similarity, PyIntsSerDe -from datasketches import AccumulatorPolicy, MaxIntPolicy, MinIntPolicy -from datasketches import update_theta_sketch - -class TupleTest(unittest.TestCase): - def test_tuple_basic_example(self): - lgk = 12 # 2^k = 4096 rows in the table - n = 1 << 18 # ~256k unique values - - # create a sketch and inject some values -- summary is 2 so we can sum them - # and know the reuslt - sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=2) - - # we can check that the upper and lower bounds bracket the - # estimate, without needing to know the exact value. - self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate()) - self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate()) - - # because this sketch is deterministically generated, we can - # also compare against the exact value - self.assertLessEqual(sk.get_lower_bound(1), n) - self.assertGreaterEqual(sk.get_upper_bound(1), n) - - # compact and serialize for storage, then reconstruct - sk_bytes = sk.compact().serialize(PyIntsSerDe()) - new_sk = compact_tuple_sketch.deserialize(sk_bytes, serde=PyIntsSerDe()) - - # estimate remains unchanged - self.assertFalse(sk.is_empty()) - self.assertEqual(sk.get_estimate(), new_sk.get_estimate()) - - # we can also iterate over the sketch entries - # the iterator provides a (hashkey, summary) pair where the - # first value is the raw hash value and the second the summary - count = 0 - cumSum = 0 - for pair in new_sk: - self.assertLess(pair[0], new_sk.get_theta64()) - count += 1 - cumSum += pair[1] - self.assertEqual(count, new_sk.get_num_retained()) - self.assertEqual(cumSum, 2 * new_sk.get_num_retained()) - - # we can even create a tuple sketch from an existing theta sketch - # as long as we provide a summary to use - theta_sk = update_theta_sketch(lgk) - for i in range(n, 2*n): - theta_sk.update(i) - cts = compact_tuple_sketch(theta_sk, 5) - cumSum = 0 - for pair in cts: - cumSum += pair[1] - self.assertEqual(cumSum, 5 * cts.get_num_retained()) - - num = sk.get_num_retained() - sk.trim() - self.assertLessEqual(sk.get_num_retained(), num) - - sk.reset() - self.assertTrue(sk.is_empty()) - self.assertEqual(sk.get_num_retained(), 0) - - def test_tuple_set_operations(self): - lgk = 12 # 2^k = 4096 rows in the table - n = 1 << 18 # ~256k unique values - - # we'll have 1/4 of the values overlap - offset = int(3 * n / 4) # it's a float w/o cast - - # create a couple sketches and inject some values, with different summaries - sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=5) - sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=7, offset=offset) - - # UNIONS - # create a union object - union = tuple_union(MaxIntPolicy(), lgk) - union.update(sk1) - union.update(sk2) - - # getting result from union returns a compact_theta_sketch - # compact theta sketches can be used in additional unions - # or set operations but cannot accept further item updates - result = union.get_result() - self.assertTrue(isinstance(result, compact_tuple_sketch)) - - # since our process here is deterministic, we have - # checked and know the exact answer is within one - # standard deviation of the estimate - self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4) - self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4) - - # we unioned two equal-sized sketches with overlap and used - # the max value as the resulting summary, meaning we should - # have more summaries with value 7 than value 5 in the result - count5 = 0 - count7 = 0 - for pair in result: - if pair[1] == 5: - count5 += 1 - elif pair[1] == 7: - count7 += 1 - else: - self.fail() - self.assertLess(count5, count7) - - # INTERSECTIONS - # create an intersection object - intersect = tuple_intersection(MinIntPolicy()) # no lg_k - intersect.update(sk1) - intersect.update(sk2) - - # has_result() indicates the intersection has been used, - # although the result may be the empty set - self.assertTrue(intersect.has_result()) - - # as with unions, the result is a compact sketch - result = intersect.get_result() - self.assertTrue(isinstance(result, compact_tuple_sketch)) - - # we know the sets overlap by 1/4 - self.assertLessEqual(result.get_lower_bound(1), n / 4) - self.assertGreaterEqual(result.get_upper_bound(1), n / 4) - - # in this example, we intersected the sketches and took the - # min value as the resulting summary, so all summaries - # must be exactly equal to that value - count5 = 0 - for pair in result: - if pair[1] == 5: - count5 += 1 - else: - self.fail() - self.assertEqual(count5, result.get_num_retained()) - - # A NOT B - # create an a_not_b object - anb = tuple_a_not_b() # no lg_k or policy - result = anb.compute(sk1, sk2) - - # as with unions, the result is a compact sketch - self.assertTrue(isinstance(result, compact_tuple_sketch)) - - # we know the sets overlap by 1/4, so the remainder is 3/4 - self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4) - self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4) - - # here, we have only values with a summary of 5 as any keys that - # existed in both sketches were removed - count5 = 0 - for pair in result: - if pair[1] == 5: - count5 += 1 - else: - self.fail() - self.assertEqual(count5, result.get_num_retained()) - - # JACCARD SIMILARITY - # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound) - # and does not examine summaries, even for (dis)similarity tests. - jac = tuple_jaccard_similarity.jaccard(sk1, sk2) - - # we can check that results are in the expected order - self.assertLess(jac[0], jac[1]) - self.assertLess(jac[1], jac[2]) - - # checks for sketch equivalence - self.assertTrue(tuple_jaccard_similarity.exactly_equal(sk1, sk1)) - self.assertFalse(tuple_jaccard_similarity.exactly_equal(sk1, sk2)) - - # we can apply a check for similarity or dissimilarity at a - # given threshold, at 97.7% confidence. - - # check that the Jaccard Index is at most (upper bound) 0.2. - # exact result would be 1/7 - self.assertTrue(tuple_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2)) - - # check that the Jaccard Index is at least (lower bound) 0.7 - # exact result would be 3/4, using result from A NOT B test - self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7)) - - - # Generates a basic tuple sketch with a fixed value for each update - def generate_tuple_sketch(self, policy, n, lgk, value, offset=0): - sk = update_tuple_sketch(policy, lgk) - for i in range(0, n): - sk.update(i + offset, value) - return sk - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/vector_of_kll_test.py b/python/tests/vector_of_kll_test.py deleted file mode 100644 index b61b1be0..00000000 --- a/python/tests/vector_of_kll_test.py +++ /dev/null @@ -1,148 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import (vector_of_kll_ints_sketches, - vector_of_kll_floats_sketches) -import numpy as np - -class VectorOfKllSketchesTest(unittest.TestCase): - def test_vector_of_kll_floats_sketches_example(self): - k = 200 - d = 3 - n = 2 ** 20 - - # create a sketch and inject ~1 million N(0,1) points - kll = vector_of_kll_floats_sketches(k, d) - # Track the min/max for each sketch to test later - smin = np.zeros(d) + np.inf - smax = np.zeros(d) - np.inf - - for i in range(0, n): - dat = np.random.randn(d) - smin = np.amin([smin, dat], axis=0) - smax = np.amax([smax, dat], axis=0) - kll.update(dat) - - # 0 should be near the median - np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.035) - # the median should be near 0 - np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.035) - # we also track the min/max independently from the rest of the data - # which lets us know the full observed data range - np.testing.assert_allclose(kll.get_min_values(), smin) - np.testing.assert_allclose(kll.get_max_values(), smax) - np.testing.assert_array_less(kll.get_min_values(), kll.get_quantiles(0.01)[:,0]) - np.testing.assert_array_less(kll.get_quantiles(0.99)[:,0], kll.get_max_values()) - - # we can also extract a list of values at a time, - # here the values should give us something close to [-2, -1, 0, 1, 2]. - # then get the CDF, which will return something close to - # the original values used in get_quantiles() - # finally, can check the normalized rank error bound - pts = kll.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772]) - # use the mean pts for the CDF, include 1.0 at end to account for all probability mass - meanpts = np.mean(pts, axis=0) - cdf = kll.get_cdf(meanpts) - self.assertEqual(cdf.shape[0], pts.shape[0]) - self.assertEqual(cdf.shape[1], pts.shape[1]+1) - - # and a few basic queries about the sketch - self.assertFalse(np.all(kll.is_empty())) - self.assertTrue(np.all(kll.is_estimation_mode())) - self.assertTrue(np.all(kll.get_n() == n)) - self.assertTrue(np.all(kll.get_num_retained() < n)) - - # we can combine sketches across all dimensions and get the reuslt - result = kll.collapse() - self.assertEqual(result.get_n(), d * n) - - # merging a copy of itself will double the number of items the sketch has seen - kll_copy = vector_of_kll_floats_sketches(kll) - kll.merge(kll_copy) - np.testing.assert_equal(kll.get_n(), 2*n) - - # we can then serialize and reconstruct the sketch - kll_bytes = kll.serialize() # serializes each sketch as a list - new_kll = vector_of_kll_floats_sketches(k, d) - for s in range(len(kll_bytes)): - new_kll.deserialize(kll_bytes[s], s) - - # everything should be exactly equal - np.testing.assert_equal(kll.get_num_retained(), new_kll.get_num_retained()) - np.testing.assert_equal;(kll.get_min_values(), new_kll.get_min_values()) - np.testing.assert_equal(kll.get_max_values(), new_kll.get_max_values()) - np.testing.assert_equal(kll.get_quantiles(0.7), new_kll.get_quantiles(0.7)) - np.testing.assert_equal(kll.get_ranks(0.0), new_kll.get_ranks(0.0)) - - def test_kll_ints_sketches(self): - # already tested floats and it's templatized, so just make sure it instantiates properly - k = 100 - d = 5 - kll = vector_of_kll_ints_sketches(k, d) - self.assertTrue(np.all(kll.is_empty())) - - def test_kll_2Dupdates(self): - # 1D case tested in the first example - # 2D case will follow same idea, but focusing on update() - k = 200 - d = 3 - # we'll do ~250k updates of 4 values each (total ~1mil updates, as above) - n = 2 ** 18 - nbatch = 4 - - # create a sketch and inject ~1 million N(0,1) points - kll = vector_of_kll_floats_sketches(k, d) - # Track the min/max for each sketch to test later - smin = np.zeros(d) + np.inf - smax = np.zeros(d) - np.inf - - for i in range(0, n): - dat = np.random.randn(nbatch, d) - smin = np.amin(np.row_stack((smin, dat)), axis=0) - smax = np.amax(np.row_stack((smax, dat)), axis=0) - kll.update(dat) - - # 0 should be near the median - np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.035) - # the median should be near 0 - np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.035) - # we also track the min/max independently from the rest of the data - # which lets us know the full observed data range - np.testing.assert_allclose(kll.get_min_values(), smin) - np.testing.assert_allclose(kll.get_max_values(), smax) - - def test_kll_3Dupdates(self): - # now test 3D update, which should fail - k = 200 - d = 3 - - # create a sketch - kll = vector_of_kll_floats_sketches(k, d) - - # we'll try 1 3D update - dat = np.random.randn(10, 7, d) - try: - kll.update(dat) - except: - # this is what we expect - pass - # the sketches should still be empty - self.assertTrue(np.all(kll.is_empty())) - -if __name__ == '__main__': - unittest.main() diff --git a/python/tests/vo_test.py b/python/tests/vo_test.py deleted file mode 100644 index 4fbca41a..00000000 --- a/python/tests/vo_test.py +++ /dev/null @@ -1,132 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest -from datasketches import var_opt_sketch, var_opt_union, PyIntsSerDe, PyStringsSerDe - -class VoTest(unittest.TestCase): - def test_vo_example(self): - k = 50 # a small value so we can easily fill the sketch - vo = var_opt_sketch(k) - - # varopt sampling reduces to standard reservoir sampling - # if the items are all equally weighted, although the - # algorithm will be significantly slower than an optimized - # reservoir sampler - n = 5 * k - for i in range(0, n): - vo.update(i) - - # we can also add a heavy item, using a negative weight for - # easy filtering later. keep in mind that "heavy" is a - # relative concept, so using a fixed multiple of n may not - # be considered a heavy item for larger values of n - vo.update(-1, 1000 * n) - self.assertEqual(k, vo.k) - self.assertEqual(k, vo.num_samples) - self.assertEqual(n + 1, vo.n) - self.assertFalse(vo.is_empty()) - - # we can easily get the list of items in the sample - items = vo.get_samples() - self.assertEqual(len(items), k) - - count = 0 - for tuple in vo: - sample = tuple[0] - weight = tuple[1] - count = count + 1 - self.assertEqual(count, vo.num_samples) - - # we can also apply a predicate to the sketch to get an estimate - # (with optimally minimal variance) of the subset sum of items - # matching that predicate among the entire population - - # we'll use a lambda here, but any function operating on a single - # item which returns a boolean value should work - summary = vo.estimate_subset_sum(lambda x: x < 0) - self.assertEqual(summary['estimate'], 1000 * n) - self.assertEqual(summary['total_sketch_weight'], 1001 * n) - - # a regular function is similarly handled - def geq_zero(x): - return x >= 0 - summary = vo.estimate_subset_sum(geq_zero) - self.assertEqual(summary['estimate'], n) - self.assertEqual(summary['total_sketch_weight'], 1001 * n) - - # next we'll create a second, smaller sketch with - # only heavier items relative to the previous sketch, - # but with the sketch in sampling mode - k2 = 5 - vo2 = var_opt_sketch(k2) - # for weight, use the estimate of all items >=0 from before - wt = summary['estimate'] - for i in range(0, k2 + 1): - vo2.update((2 * n) + i, wt) - - # now union the sketches, demonstrating how the - # union's k may not be equal to that of either - # input value - union = var_opt_union(k) - union.update(vo) - union.update(vo2) - - result = union.get_result() - self.assertEqual(n + k2 + 2, result.n) - self.assertFalse(result.is_empty()) - self.assertGreater(result.k, k2) - self.assertLess(result.k, k) - - # we can compare what information is available from both - # the union and a sketch. - print(union) - - # if we want to print the list of items, there must be a - # __str__() method for each item (which need not be the same - # type; they're all generic python objects when used from - # python), otherwise you may trigger an exception. - # to_string() is provided as a convenience to avoid direct - # calls to __str__() with parameters. - print(result.to_string(True)) - - # finally, we can serialize the sketch by providing an - # appropriate serde class. - expected_size = result.get_serialized_size_bytes(PyIntsSerDe()) - b = result.serialize(PyIntsSerDe()) - self.assertEqual(expected_size, len(b)) - - # if we try to deserialize with the wrong serde, things break - try: - var_opt_sketch.deserialize(b, PyStringsSerDe()) - self.fail() - except: - # expected; do nothing - self.assertTrue(True) - - # using the correct serde gives us back a copy of the original - rebuilt = var_opt_sketch.deserialize(b, PyIntsSerDe()) - self.assertEqual(result.k, rebuilt.k) - self.assertEqual(result.num_samples, rebuilt.num_samples) - self.assertEqual(result.n, rebuilt.n) - summary1 = result.estimate_subset_sum(geq_zero) - summary2 = rebuilt.estimate_subset_sum(geq_zero) - self.assertEqual(summary1['estimate'], summary2['estimate']) - self.assertEqual(summary1['total_sketch_weight'], summary2['total_sketch_weight']) - -if __name__ == '__main__': - unittest.main() diff --git a/setup.py b/setup.py deleted file mode 100644 index 40b5949f..00000000 --- a/setup.py +++ /dev/null @@ -1,110 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Modified from: -# http://www.benjack.io/2018/02/02/python-cpp-revisited.html - -import os -import sys -import platform -import subprocess -import re -from datetime import datetime, timezone - -from setuptools import setup, find_packages, Extension -from setuptools.command.build_ext import build_ext - -class CMakeExtension(Extension): - def __init__(self, name, sourcedir=''): - Extension.__init__(self, name, sources=[]) - self.sourcedir = os.path.abspath(sourcedir) - -class CMakeBuild(build_ext): - def run(self): - try: - subprocess.check_output(['cmake', '--version']) - except OSError: - raise RuntimeError( - "CMake >= 3.12 must be installed to build the following extensions: " + - ", ".join(e.name for e in self.extensions)) - - for ext in self.extensions: - self.build_extension(ext) - - def build_extension(self, ext): - extdir = os.path.abspath( - os.path.dirname(self.get_ext_fullpath(ext.name))) - cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir] - cmake_args += ['-DWITH_PYTHON=True'] - cmake_args += ['-DCMAKE_CXX_STANDARD=11'] - # ensure we use a consistent python version - cmake_args += ['-DPython3_EXECUTABLE=' + sys.executable] - cfg = 'Debug' if self.debug else 'Release' - build_args = ['--config', cfg] - - if platform.system() == "Windows": - cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format( - cfg.upper(), - extdir)] - if sys.maxsize > 2**32: - cmake_args += ['-T', 'host=x64'] - cmake_args += ['-DCMAKE_GENERATOR_PLATFORM=x64'] - build_args += ['--', '/m'] - else: - cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] - build_args += ['--', '-j2'] - - env = os.environ.copy() - env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format( - env.get('CXXFLAGS', ''), - self.distribution.get_version()) - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) - subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, - cwd=self.build_temp, env=env) - subprocess.check_call(['cmake', '--build', '.', '--target', 'python'] + build_args, - cwd=self.build_temp, env=env) - print() # add an empty line to pretty print - -# Read and parse the version format -# @DT@ -> datestamp -# @HHMM@ -> .devHHMM to indicate development version -# Releases should have a fixed version with no @ variables -with open('version.cfg.in', 'r') as file: - ds_version = file.read().rstrip() -dt = datetime.now(timezone.utc) -ds_version = re.sub('@DT@', dt.strftime('%Y%m%d'), ds_version) -ds_version = re.sub('@HHMM@', 'dev' + dt.strftime('%H%M'), ds_version) - -setup( - name='datasketches', - version=ds_version, - author='Apache Software Foundation', - author_email='dev@datasketches.apache.org', - description='The Apache DataSketches Library for Python', - license='Apache License 2.0', - url='http://datasketches.apache.org', - long_description=open('python/README.md').read(), - long_description_content_type='text/markdown', - packages=find_packages(where='python',exclude=['src','include','*tests*']), # src not needed if only the .so - package_dir={'':'python'}, - # may need to add all source paths for sdist packages w/o MANIFEST.in - ext_modules=[CMakeExtension('datasketches')], - cmdclass={'build_ext': CMakeBuild}, - install_requires=['numpy'], - zip_safe=False -) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index e5ae372f..00000000 --- a/tox.ini +++ /dev/null @@ -1,26 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[tox] -envlist = py3 -isolated_build = true - -[testenv] -deps = pytest - numpy -changedir = python/tests -commands = pytest From e4e626c6c64cd0ea29ac11ebb37d7774f87e004d Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Wed, 6 Sep 2023 15:06:52 -0700 Subject: [PATCH 2/3] remove python, remove unnecessary setting in recommended cmake dependency --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index 7607ac07..bad8503c 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,7 @@ If you are interested in making contributions to this site please see our [Commu This code requires C++11. -This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python). - -This library is header-only. The build process provided is only for building unit tests and the python library. +This library is header-only. The build process provided is only for building unit tests. Building the unit tests requires cmake 3.12.0 or higher. @@ -92,13 +90,6 @@ from GitHub using CMake's `ExternalProject` module. The code would look somethin GIT_SUBMODULES "" INSTALL_DIR /tmp/datasketches-prefix CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix - - # Override the install command to add DESTDIR - # This is necessary to work around an oddity in the RPM (but not other) package - # generation, as CMake otherwise picks up the Datasketch files when building - # an RPM for a dependent package. (RPM scans the directory for files in addition to installing - # those files referenced in an "install" rule in the cmake file) - INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install ) ExternalProject_Get_property(datasketches INSTALL_DIR) set(datasketches_INSTALL_DIR ${INSTALL_DIR}) From 3e777f7e005939131d84c4a28af861a8bfb718c4 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 7 Sep 2023 11:54:30 -0700 Subject: [PATCH 3/3] restored recommended override as it might be necessary --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index bad8503c..1ecffe57 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,13 @@ from GitHub using CMake's `ExternalProject` module. The code would look somethin GIT_SUBMODULES "" INSTALL_DIR /tmp/datasketches-prefix CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix + + # Override the install command to add DESTDIR + # This is necessary to work around an oddity in the RPM (but not other) package + # generation, as CMake otherwise picks up the Datasketch files when building + # an RPM for a dependent package. (RPM scans the directory for files in addition to installing + # those files referenced in an "install" rule in the cmake file) + INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install ) ExternalProject_Get_property(datasketches INSTALL_DIR) set(datasketches_INSTALL_DIR ${INSTALL_DIR})