From 67a213d398388ebf8050a26454b541a23d0a336b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 3 Sep 2024 16:28:50 -0400 Subject: [PATCH] PyVortex PyVortex -------- The generated documentation for this branch is available at https://spiraldb.github.io/vortex/docs/ The Python package is now structured like this: - `vortex` - `array()`: converts a list or an Arrow array into a Vortex array. - `encodings` - `Array`: In Rust this is called a PyArray and it is just PyO3 wrapper around a Vortex Rust Array. - `to_pandas` - `to_numpy` - `compress()`: compresses an Array. - `dtype`: A module containing dtype constructors, e.g. `uint(32, nullable=False)` - `io`: Readers and writers which currently only work for Struct arrays without top-level nulls. - `read()` - `write()` - `expr` - `Expr`: a class, implemented in Rust, which constructs vortex-exprs using the obvious Python operators. I also added `python_repr` which returns a Display-able struct that renders itself in the Python `repr` style. In particular, the dtypes look like `uint(32, False)` rather than `u32`. I think the only bugfixes in this PR are: 1. pyvortex/src/encode.rs: propagate the nullability from Arrow to `Array::from_arrow`. 2. arrow/recordbatch.rs and arrow/dtype.rs need to return compatible nullability and validity. Future Work ----------- 1. Automatically generate and deploy the documentation to github.io. 2. Run `cd pyvortex/docs && make doctest` on every commit. --- .gitignore | 5 +- Cargo.lock | 7 + pyvortex/Cargo.toml | 8 + pyvortex/docs/Makefile | 20 ++ pyvortex/docs/conf.py | 40 ++++ pyvortex/docs/dtype.rst | 7 + pyvortex/docs/encoding.rst | 7 + pyvortex/docs/expr.rst | 6 + pyvortex/docs/index.rst | 19 ++ pyvortex/docs/io.rst | 6 + pyvortex/docs/make.bat | 35 +++ pyvortex/pyproject.toml | 9 +- pyvortex/python/vortex/__init__.py | 7 +- pyvortex/python/vortex/encoding.py | 168 +++++++++++++++ pyvortex/src/array.rs | 123 ++++++++++- pyvortex/src/compress.rs | 76 +++---- pyvortex/src/dtype.rs | 258 ++++++++++++++++++++++- pyvortex/src/encode.rs | 14 +- pyvortex/src/expr.rs | 293 ++++++++++++++++++++++++++ pyvortex/src/io.rs | 255 ++++++++++++++++++++++ pyvortex/src/lib.rs | 104 +++------ pyvortex/src/vortex_arrow.rs | 48 ----- pyvortex/test/test_array.py | 14 +- pyvortex/test/test_compress.py | 34 +-- pyvortex/test/test_dtype.py | 12 +- requirements-dev.lock | 80 +++++++ requirements.lock | 57 +++++ vortex-array/src/array/struct_/mod.rs | 6 +- vortex-array/src/arrow/dtype.rs | 2 +- vortex-array/src/arrow/recordbatch.rs | 2 +- vortex-array/src/compute/take.rs | 8 +- vortex-dtype/src/dtype.rs | 49 +++++ vortex-dtype/src/extension.rs | 32 +++ vortex-dtype/src/nullability.rs | 19 ++ 34 files changed, 1618 insertions(+), 212 deletions(-) create mode 100644 pyvortex/docs/Makefile create mode 100644 pyvortex/docs/conf.py create mode 100644 pyvortex/docs/dtype.rst create mode 100644 pyvortex/docs/encoding.rst create mode 100644 pyvortex/docs/expr.rst create mode 100644 pyvortex/docs/index.rst create mode 100644 pyvortex/docs/io.rst create mode 100644 pyvortex/docs/make.bat create mode 100644 pyvortex/python/vortex/encoding.py create mode 100644 pyvortex/src/expr.rs create mode 100644 pyvortex/src/io.rs delete mode 100644 pyvortex/src/vortex_arrow.rs diff --git a/.gitignore b/.gitignore index a1d72fa9fe..37dccd67a7 100644 --- a/.gitignore +++ b/.gitignore @@ -72,7 +72,7 @@ instance/ .scrapy # Sphinx documentation -docs/_build/ +pyvortex/docs/_build/ # PyBuilder .pybuilder/ @@ -196,3 +196,6 @@ data/ # vscode .vscode/ + +# Emacs +*~ diff --git a/Cargo.lock b/Cargo.lock index 90cd4b35c0..b5a3fac37a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3077,18 +3077,25 @@ name = "pyvortex" version = "0.7.0" dependencies = [ "arrow", + "flexbuffers", + "futures", "log", "paste", "pyo3", "pyo3-log", + "tokio", "vortex-alp", "vortex-array", "vortex-dict", "vortex-dtype", "vortex-error", + "vortex-expr", "vortex-fastlanes", "vortex-roaring", "vortex-runend", + "vortex-sampling-compressor", + "vortex-scalar", + "vortex-serde", "vortex-zigzag", ] diff --git a/pyvortex/Cargo.toml b/pyvortex/Cargo.toml index 9f835cb002..c8ca8b86d5 100644 --- a/pyvortex/Cargo.toml +++ b/pyvortex/Cargo.toml @@ -18,21 +18,29 @@ workspace = true [lib] name = "pyvortex" crate-type = ["rlib", "cdylib"] +doctest = false [dependencies] arrow = { workspace = true, features = ["pyarrow"] } +flexbuffers = { workspace = true } +futures = { workspace = true } log = { workspace = true } paste = { workspace = true } pyo3 = { workspace = true } pyo3-log = { workspace = true } +tokio = { workspace = true, features = ["fs"] } vortex-alp = { workspace = true } vortex-array = { workspace = true } vortex-dict = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } +vortex-expr = { workspace = true } vortex-fastlanes = { workspace = true } vortex-roaring = { workspace = true } vortex-runend = { workspace = true } +vortex-sampling-compressor = { workspace = true } +vortex-serde = { workspace = true, features = ["tokio"] } +vortex-scalar = { workspace = true } vortex-zigzag = { workspace = true } # We may need this workaround? diff --git a/pyvortex/docs/Makefile b/pyvortex/docs/Makefile new file mode 100644 index 0000000000..5117fbf5b3 --- /dev/null +++ b/pyvortex/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -W --keep-going +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/pyvortex/docs/conf.py b/pyvortex/docs/conf.py new file mode 100644 index 0000000000..5beff0b1c2 --- /dev/null +++ b/pyvortex/docs/conf.py @@ -0,0 +1,40 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "Vortex" +copyright = "2024, Spiral" +author = "Spiral" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.doctest", +] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), + "numpy": ("https://numpy.org/doc/stable/", None), +} + +nitpicky = True # ensures all :class:, :obj:, etc. links are valid + +doctest_global_setup = "import pyarrow; import vortex" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "pydata_sphinx_theme" +# html_static_path = ['_static'] # no static files yet diff --git a/pyvortex/docs/dtype.rst b/pyvortex/docs/dtype.rst new file mode 100644 index 0000000000..9c30bc80b9 --- /dev/null +++ b/pyvortex/docs/dtype.rst @@ -0,0 +1,7 @@ +Array Data Types +================ + +.. automodule:: vortex.dtype + :members: + :imported-members: + diff --git a/pyvortex/docs/encoding.rst b/pyvortex/docs/encoding.rst new file mode 100644 index 0000000000..8448777fba --- /dev/null +++ b/pyvortex/docs/encoding.rst @@ -0,0 +1,7 @@ +Arrays +====== + +.. automodule:: vortex.encoding + :members: + :imported-members: + :special-members: __len__ diff --git a/pyvortex/docs/expr.rst b/pyvortex/docs/expr.rst new file mode 100644 index 0000000000..854aec35ce --- /dev/null +++ b/pyvortex/docs/expr.rst @@ -0,0 +1,6 @@ +Row Filter Expressions +====================== + +.. automodule:: vortex.expr + :members: + :imported-members: diff --git a/pyvortex/docs/index.rst b/pyvortex/docs/index.rst new file mode 100644 index 0000000000..d45218ad06 --- /dev/null +++ b/pyvortex/docs/index.rst @@ -0,0 +1,19 @@ +.. Vortex documentation master file, created by + sphinx-quickstart on Wed Aug 28 10:10:21 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Vortex documentation +==================== + +Vortex is an Apache Arrow-compatible toolkit for working with compressed array data. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + encoding + dtype + io + expr + diff --git a/pyvortex/docs/io.rst b/pyvortex/docs/io.rst new file mode 100644 index 0000000000..f2cc405ce9 --- /dev/null +++ b/pyvortex/docs/io.rst @@ -0,0 +1,6 @@ +Input and Output +================ + +.. automodule:: vortex.io + :members: + :imported-members: diff --git a/pyvortex/docs/make.bat b/pyvortex/docs/make.bat new file mode 100644 index 0000000000..32bb24529f --- /dev/null +++ b/pyvortex/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/pyvortex/pyproject.toml b/pyvortex/pyproject.toml index f472a8dbb7..aa5dfd129a 100644 --- a/pyvortex/pyproject.toml +++ b/pyvortex/pyproject.toml @@ -5,7 +5,9 @@ description = "Add your description here" authors = [ { name = "Nicholas Gates", email = "nick@nickgates.com" } ] -dependencies = [] +dependencies = [ + "pydata-sphinx-theme>=0.15.4", +] requires-python = ">= 3.11" classifiers = ["Private :: Do Not Upload"] @@ -17,7 +19,10 @@ build-backend = "maturin" managed = true dev-dependencies = [ "pyarrow>=15.0.0", - "pip" + "pip", + "sphinx>=8.0.2", + "ipython>=8.26.0", + "pandas>=2.2.2", ] [tool.maturin] diff --git a/pyvortex/python/vortex/__init__.py b/pyvortex/python/vortex/__init__.py index 5e75e016eb..a860b4432c 100644 --- a/pyvortex/python/vortex/__init__.py +++ b/pyvortex/python/vortex/__init__.py @@ -1,4 +1,7 @@ -from ._lib import * # noqa: F403 -from ._lib import __doc__ as module_docs +from ._lib import __doc__ as module_docs, io, expr, dtype +from . import encoding + __doc__ = module_docs +del module_docs +array = encoding.array diff --git a/pyvortex/python/vortex/encoding.py b/pyvortex/python/vortex/encoding.py new file mode 100644 index 0000000000..c16c177d90 --- /dev/null +++ b/pyvortex/python/vortex/encoding.py @@ -0,0 +1,168 @@ +from typing import Union, Optional + +import pyarrow + +from ._lib import encoding as _encoding + +__doc__ = _encoding.__doc__ + +Array = _encoding.Array +compress = _encoding.compress + + +def _Array_to_pandas(self: _encoding.Array, *, name: Optional[str] = None, flatten: bool = False): + """Construct a Pandas dataframe from this Vortex array. + + Parameters + ---------- + obj : :class:`pyarrow.Array` or :class:`list` + The elements of this array or list become the elements of the Vortex array. + + name : :class:`str`, optional + The name of the column in the newly created dataframe. If unspecified, use `x`. + + flatten : :class:`bool` + If :obj:`True`, Struct columns are flattened in the dataframe. See the examples. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + + Construct a :class:`.pandas.DataFrame` with one column named `animals` from the contents of a Vortex + array: + + >>> array = vortex.encoding.array(['dog', 'cat', 'mouse', 'rat']) + >>> array.to_pandas(name='animals') + animals + 0 dog + 1 cat + 2 mouse + 3 rat + + Construct a :class:`.pandas.DataFrame` with the default column name: + + >>> array = vortex.encoding.array(['dog', 'cat', 'mouse', 'rat']) + >>> array.to_pandas() + x + 0 dog + 1 cat + 2 mouse + 3 rat + + Construct a dataframe with a Struct-typed column: + + >>> array = vortex.encoding.array([ + ... {'name': 'Joseph', 'age': 25}, + ... {'name': 'Narendra', 'age': 31}, + ... {'name': 'Angela', 'age': 33}, + ... {'name': 'Mikhail', 'age': 57}, + ... ]) + >>> array.to_pandas() + x + 0 {'age': 25, 'name': 'Joseph'} + 1 {'age': 31, 'name': 'Narendra'} + 2 {'age': 33, 'name': 'Angela'} + 3 {'age': 57, 'name': 'Mikhail'} + + Lift the struct fields to the top-level in the dataframe: + + >>> array.to_pandas(flatten=True) + x.age x.name + 0 25 Joseph + 1 31 Narendra + 2 33 Angela + 3 57 Mikhail + + """ + name = name or "x" + table = pyarrow.Table.from_arrays([self.to_arrow()], [name]) + if flatten: + table = table.flatten() + return table.to_pandas() + + +Array.to_pandas = _Array_to_pandas + + +def _Array_to_numpy(self: _encoding.Array, *, zero_copy_only: bool = True): + """Construct a NumPy array from this Vortex array. + + This is an alias for :code:`self.to_arrow().to_numpy(zero_copy_only)` + + Returns + ------- + :class:`numpy.ndarray` + + Examples + -------- + + Construct an ndarray from a Vortex array: + + >>> array = vortex.encoding.array([1, 0, 0, 1]) + >>> array.to_numpy() + array([1, 0, 0, 1]) + + """ + return self.to_arrow().to_numpy(zero_copy_only=zero_copy_only) + + +Array.to_numpy = _Array_to_numpy + + +def array(obj: Union[pyarrow.Array, list]) -> Array: + """The main entry point for creating Vortex arrays from other Python objects. + + This function is also available as ``vortex.array``. + + Parameters + ---------- + obj : :class:`pyarrow.Array` or :class:`list` + The elements of this array or list become the elements of the Vortex array. + + Returns + ------- + :class:`vortex.encoding.Array` + + Examples + -------- + + A Vortex array containing the first three integers. + + >>> vortex.encoding.array([1, 2, 3]).to_arrow() + + [ + 1, + 2, + 3 + ] + + The same Vortex array with a null value in the third position. + + >>> vortex.encoding.array([1, 2, None, 3]).to_arrow() + + [ + 1, + 2, + null, + 3 + ] + + Initialize a Vortex array from an Arrow array: + + >>> arrow = pyarrow.array(['Hello', 'it', 'is', 'me']) + >>> vortex.encoding.array(arrow).to_arrow() + + [ + "Hello", + "it", + "is", + "me" + ] + + """ + if isinstance(obj, list): + return _encoding._encode(pyarrow.array(obj)) + return _encoding._encode(obj) diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 3ec4166952..5e99ac86a8 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -1,12 +1,17 @@ +use arrow::array::{Array as ArrowArray, ArrayRef}; +use arrow::pyarrow::ToPyArrow; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; +use pyo3::types::{IntoPyDict, PyList}; +use vortex::array::ChunkedArray; use vortex::compute::take; -use vortex::{Array, ArrayDType}; +use vortex::{Array, ArrayDType, IntoCanonical}; use crate::dtype::PyDType; use crate::error::PyVortexError; -use crate::vortex_arrow; #[pyclass(name = "Array", module = "vortex", sequence, subclass)] +/// An array of zero or more *rows* each with the same set of *columns*. pub struct PyArray { inner: Array, } @@ -23,8 +28,65 @@ impl PyArray { #[pymethods] impl PyArray { + /// Convert this array to an Arrow array. + /// + /// Returns + /// ------- + /// :class:`pyarrow.Array` + /// + /// Examples + /// -------- + /// + /// Round-trip an Arrow array through a Vortex array: + /// + /// >>> vortex.encoding.array([1, 2, 3]).to_arrow() + /// + /// [ + /// 1, + /// 2, + /// 3 + /// ] fn to_arrow(self_: PyRef<'_, Self>) -> PyResult> { - vortex_arrow::export_array(self_.py(), &self_.inner) + // NOTE(ngates): for struct arrays, we could also return a RecordBatchStreamReader. + // NOTE(robert): Return RecordBatchStreamReader always? + let py = self_.py(); + let vortex = &self_.inner; + + if let Ok(chunked_array) = ChunkedArray::try_from(vortex) { + let chunks: Vec = chunked_array + .chunks() + .map(|chunk| -> PyResult { + Ok(chunk + .into_canonical() + .map_err(PyVortexError::map_err)? + .into_arrow()) + }) + .collect::>>()?; + if chunks.is_empty() { + return Err(PyValueError::new_err("No chunks in array")); + } + let pa_data_type = chunks[0].data_type().clone().to_pyarrow(py)?; + let chunks: PyResult> = chunks + .iter() + .map(|arrow_array| arrow_array.into_data().to_pyarrow(py)) + .collect(); + + // Combine into a chunked array + PyModule::import_bound(py, "pyarrow")?.call_method( + "chunked_array", + (PyList::new_bound(py, chunks?),), + Some(&[("type", pa_data_type)].into_py_dict_bound(py)), + ) + } else { + Ok(vortex + .clone() + .into_canonical() + .map_err(PyVortexError::map_err)? + .into_arrow() + .into_data() + .to_pyarrow(py)? + .into_bound(py)) + } } fn __len__(&self) -> usize { @@ -45,11 +107,66 @@ impl PyArray { self.inner.nbytes() } + /// The data type of this array. + /// + /// Returns + /// ------- + /// :class:`vortex.dtype.DType` + /// + /// Examples + /// -------- + /// + /// By default, :func:`vortex.encoding.array` uses the largest available bit-width: + /// + /// >>> vortex.encoding.array([1, 2, 3]).dtype + /// int(64, False) + /// + /// Including a :obj:`None` forces a nullable type: + /// + /// >>> vortex.encoding.array([1, None, 2, 3]).dtype + /// int(64, True) + /// + /// A UTF-8 string array: + /// + /// >>> vortex.encoding.array(['hello, ', 'is', 'it', 'me?']).dtype + /// utf8(False) #[getter] fn dtype(self_: PyRef) -> PyResult> { PyDType::wrap(self_.py(), self_.inner.dtype().clone()) } + /// Filter, permute, and/or repeat elements by their index. + /// + /// Returns + /// ------- + /// :class:`vortex.encoding.Array` + /// + /// Examples + /// -------- + /// + /// Keep only the first and third elements: + /// + /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) + /// >>> indices = vortex.encoding.array([0, 2]) + /// >>> a.take(indices).to_arrow() + /// + /// [ + /// "a", + /// "c" + /// ] + /// + /// Permute and repeat the first and second elements: + /// + /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) + /// >>> indices = vortex.encoding.array([0, 1, 1, 0]) + /// >>> a.take(indices).to_arrow() + /// + /// [ + /// "a", + /// "b", + /// "b", + /// "a" + /// ] fn take<'py>(&self, indices: PyRef<'py, Self>) -> PyResult> { take(&self.inner, indices.unwrap()) .map_err(PyVortexError::map_err) diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs index 71c29be9fa..f029e0ccea 100644 --- a/pyvortex/src/compress.rs +++ b/pyvortex/src/compress.rs @@ -1,45 +1,45 @@ -use std::sync::Arc; - -use pyo3::types::PyType; -use pyo3::{pyclass, pyfunction, pymethods, Py, PyResult, Python}; -use vortex::compress::{CompressConfig, CompressCtx}; +use pyo3::prelude::*; +use vortex_sampling_compressor::SamplingCompressor; use crate::array::PyArray; use crate::error::PyVortexError; -#[derive(Clone)] -#[pyclass(name = "CompressConfig", module = "vortex")] -pub struct PyCompressConfig { - inner: CompressConfig, -} - -#[pymethods] -impl PyCompressConfig { - #[classmethod] - pub fn default(cls: &PyType) -> PyResult> { - Py::new(cls.py(), ::default()) - } -} - -impl Default for PyCompressConfig { - fn default() -> Self { - Self { - inner: CompressConfig::new().with_enabled(ENCODINGS.iter().cloned()), - } - } -} - #[pyfunction] -#[pyo3(signature = (arr, opts = None))] -pub fn compress( - py: Python<'_>, - arr: &PyArray, - opts: Option, -) -> PyResult> { - let compress_opts = opts.unwrap_or_default().inner; - let ctx = CompressCtx::new(Arc::new(compress_opts)); - let compressed = py - .allow_threads(|| ctx.compress(arr.unwrap(), None)) - .map_err(PyVortexError::map_err)?; +/// Attempt to compress a vortex array. +/// +/// Parameters +/// ---------- +/// array : :class:`vortex.encoding.Array` +/// The array. +/// +/// Examples +/// -------- +/// +/// Compress a very sparse array of integers: +/// +/// >>> a = vortex.encoding.array([42 for _ in range(1000)]) +/// >>> str(vortex.encoding.compress(a)) +/// 'vortex.constant(0x0a)(i64, len=1000)' +/// +/// Compress an array of increasing integers: +/// +/// >>> a = vortex.encoding.array(list(range(1000))) +/// >>> str(vortex.encoding.compress(a)) +/// 'fastlanes.for(0x0f)(i64, len=1000)' +/// +/// Compress an array of increasing floating-point numbers and a few nulls: +/// +/// >>> a = vortex.encoding.array([ +/// ... float(x) if x % 20 != 0 else None +/// ... for x in range(1000) +/// ... ]) +/// >>> str(vortex.encoding.compress(a)) +/// 'vortex.alp(0x0d)(f64?, len=1000)' +pub fn compress<'py>(array: &Bound<'py, PyArray>) -> PyResult> { + let compressor = SamplingCompressor::default(); + let inner = compressor + .compress(&array.borrow().unwrap(), None) + .map_err(PyVortexError::new)? + .into_array(); Bound::new(array.py(), PyArray::new(inner)) } diff --git a/pyvortex/src/dtype.rs b/pyvortex/src/dtype.rs index 548f0cd563..0737f2c0f2 100644 --- a/pyvortex/src/dtype.rs +++ b/pyvortex/src/dtype.rs @@ -1,11 +1,14 @@ use arrow::datatypes::{DataType, Field}; use arrow::pyarrow::FromPyArrow; +use pyo3::exceptions::PyValueError; use pyo3::types::PyType; -use pyo3::{pyclass, pymethods, Bound, Py, PyAny, PyResult, Python}; +use pyo3::{pyclass, pyfunction, pymethods, Bound, Py, PyAny, PyResult, Python}; use vortex::arrow::FromArrowType; -use vortex_dtype::DType; +use vortex_dtype::{DType, PType}; #[pyclass(name = "DType", module = "vortex", subclass)] +/// A data type describes the set of operations available on a given column. These operations are +/// implemented by the column *encoding*. Each data type is implemented by one or more encodings. pub struct PyDType { inner: DType, } @@ -26,8 +29,12 @@ impl PyDType { format!("{}", self.inner) } + fn __repr__(&self) -> String { + format!("{}", self.inner.python_repr()) + } + #[classmethod] - fn from_pyarrow( + fn from_arrow( cls: &Bound, #[pyo3(from_py_with = "import_arrow_dtype")] arrow_dtype: DataType, nullable: bool, @@ -37,8 +44,253 @@ impl PyDType { DType::from_arrow(&Field::new("_", arrow_dtype, nullable)), ) } + + fn maybe_columns(&self) -> Option> { + match &self.inner { + DType::Null => None, + DType::Bool(_) => None, + DType::Primitive(..) => None, + DType::Utf8(_) => None, + DType::Binary(_) => None, + DType::Struct(child, _) => Some(child.names().iter().map(|x| x.to_string()).collect()), + DType::List(..) => None, + DType::Extension(..) => None, + } + } } fn import_arrow_dtype(obj: &Bound) -> PyResult { DataType::from_pyarrow_bound(obj) } + +#[pyfunction(name = "null")] +#[pyo3(signature = ())] +/// Construct the data type for a column containing only the null value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting only :obj:`None`. +/// +/// >>> vortex.dtype.null() +/// null() +pub fn dtype_null(py: Python<'_>) -> PyResult> { + PyDType::wrap(py, DType::Null) +} + +#[pyfunction(name = "bool")] +#[pyo3(signature = (nullable = false))] +/// Construct a Boolean data type. +/// +/// Parameters +/// ---------- +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting :obj:`None`, :obj:`True`, and :obj:`False`. +/// +/// >>> vortex.dtype.bool(True) +/// bool(True) +/// +/// A data type permitting just :obj:`True` and :obj:`False`. +/// +/// >>> vortex.dtype.bool(False) +/// bool(False) +pub fn dtype_bool(py: Python<'_>, nullable: bool) -> PyResult> { + PyDType::wrap(py, DType::Bool(nullable.into())) +} + +#[pyfunction(name = "int")] +#[pyo3(signature = (width = None, nullable = false))] +/// Construct a signed integral data type. +/// +/// Parameters +/// ---------- +/// width : one of 8, 16, 32, and 64. +/// +/// The bit width determines the span of valid values. If :obj:`None`, 64 is used. +/// +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting :obj:`None` and the integers from -128 to 127, inclusive: +/// +/// >>> vortex.dtype.int(8, True) +/// int(8, True) +/// +/// A data type permitting just the integers from -2,147,483,648 to 2,147,483,647, inclusive: +/// +/// >>> vortex.dtype.int(32, False) +/// int(32, False) +pub fn dtype_int(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { + let dtype = if let Some(width) = width { + match width { + 8 => DType::Primitive(PType::I8, nullable.into()), + 16 => DType::Primitive(PType::I16, nullable.into()), + 32 => DType::Primitive(PType::I32, nullable.into()), + 64 => DType::Primitive(PType::I64, nullable.into()), + _ => return Err(PyValueError::new_err("Invalid int width")), + } + } else { + DType::Primitive(PType::I64, nullable.into()) + }; + PyDType::wrap(py, dtype) +} + +#[pyfunction(name = "uint")] +#[pyo3(signature = (width = None, nullable = false))] +/// Construct an unsigned integral data type. +/// +/// Parameters +/// ---------- +/// width : one of 8, 16, 32, and 64. +/// +/// The bit width determines the span of valid values. If :obj:`None`, 64 is used. +/// +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting :obj:`None` and the integers from 0 to 255, inclusive: +/// +/// >>> vortex.dtype.uint(8, True) +/// uint(8, True) +/// +/// A data type permitting just the integers from 0 to 4,294,967,296 inclusive: +/// +/// >>> vortex.dtype.uint(32, False) +/// uint(32, False) +pub fn dtype_uint(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { + let dtype = if let Some(width) = width { + match width { + 8 => DType::Primitive(PType::U8, nullable.into()), + 16 => DType::Primitive(PType::U16, nullable.into()), + 32 => DType::Primitive(PType::U32, nullable.into()), + 64 => DType::Primitive(PType::U64, nullable.into()), + _ => return Err(PyValueError::new_err("Invalid uint width")), + } + } else { + DType::Primitive(PType::U64, nullable.into()) + }; + PyDType::wrap(py, dtype) +} + +#[pyfunction(name = "float")] +#[pyo3(signature = (width = None, nullable = false))] +/// Construct an IEEE 754 binary floating-point data type. +/// +/// Parameters +/// ---------- +/// width : one of 16, 32, and 64. +/// +/// The bit width determines the range and precision of the floating-point values. If +/// :obj:`None`, 64 is used. +/// +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting :obj:`None` as well as IEEE 754 binary16 floating-point values. Values +/// larger than 65,520 or less than -65,520 will respectively round to positive and negative +/// infinity. +/// +/// >>> vortex.dtype.float(16, False) +/// float(16, False) +pub fn dtype_float(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { + let dtype = if let Some(width) = width { + match width { + 16 => DType::Primitive(PType::F16, nullable.into()), + 32 => DType::Primitive(PType::F32, nullable.into()), + 64 => DType::Primitive(PType::F64, nullable.into()), + _ => return Err(PyValueError::new_err("Invalid float width")), + } + } else { + DType::Primitive(PType::F64, nullable.into()) + }; + PyDType::wrap(py, dtype) +} + +#[pyfunction(name = "utf8")] +#[pyo3(signature = (nullable = false))] +/// Construct a UTF-8-encoded string data type. +/// +/// Parameters +/// ---------- +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting any UTF-8-encoded string, such as :code:`"Hello World"`, but not +/// permitting :obj:`None`. +/// +/// >>> vortex.dtype.utf8(False) +/// utf8(False) +pub fn dtype_utf8(py: Python<'_>, nullable: bool) -> PyResult> { + PyDType::wrap(py, DType::Utf8(nullable.into())) +} + +#[pyfunction(name = "binary")] +#[pyo3(signature = (nullable = false))] +/// Construct a data type for binary strings. +/// +/// Parameters +/// ---------- +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting any string of bytes but not permitting :obj:`None`. +/// +/// >>> vortex.dtype.binary(False) +/// binary(False) +pub fn dtype_binary(py: Python<'_>, nullable: bool) -> PyResult> { + PyDType::wrap(py, DType::Binary(nullable.into())) +} diff --git a/pyvortex/src/encode.rs b/pyvortex/src/encode.rs index 2cc6adccff..ced01879cc 100644 --- a/pyvortex/src/encode.rs +++ b/pyvortex/src/encode.rs @@ -12,12 +12,10 @@ use vortex_dtype::DType; use crate::array::PyArray; use crate::error::PyVortexError; -use crate::vortex_arrow::map_arrow_err; -/// The main entry point for creating enc arrays from other Python objects. -/// +// Private, ergo not documented. #[pyfunction] -pub fn encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { +pub fn _encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { let pa = obj.py().import_bound("pyarrow")?; let pa_array = pa.getattr("Array")?; let chunked_array = pa.getattr("ChunkedArray")?; @@ -25,7 +23,8 @@ pub fn encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { if obj.is_instance(&pa_array)? { let arrow_array = ArrowArrayData::from_pyarrow_bound(obj).map(make_array)?; - let enc_array = Array::from_arrow(arrow_array, false); + let is_nullable = arrow_array.is_nullable(); + let enc_array = Array::from_arrow(arrow_array, is_nullable); Bound::new(obj.py(), PyArray::new(enc_array)) } else if obj.is_instance(&chunked_array)? { let chunks: Vec> = obj.getattr("chunks")?.extract()?; @@ -54,7 +53,10 @@ pub fn encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { let dtype = DType::from_arrow(array_stream.schema()); let chunks = array_stream .into_iter() - .map(|b| b.map(Array::from).map_err(map_arrow_err)) + .map(|b| { + b.map(Array::from) + .map_err(|e| PyValueError::new_err(e.to_string())) + }) .collect::>>()?; Bound::new( obj.py(), diff --git a/pyvortex/src/expr.rs b/pyvortex/src/expr.rs new file mode 100644 index 0000000000..2af04e49f2 --- /dev/null +++ b/pyvortex/src/expr.rs @@ -0,0 +1,293 @@ +use std::sync::Arc; + +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use pyo3::types::*; +use vortex_dtype::field::Field; +use vortex_dtype::half::f16; +use vortex_dtype::{DType, Nullability, PType}; +use vortex_expr::{BinaryExpr, Column, Literal, Operator, VortexExpr}; +use vortex_scalar::{PValue, Scalar, ScalarValue}; + +use crate::dtype::PyDType; + +/// An expression describes how to filter rows when reading an array from a file. +/// +/// Examples +/// ======== +/// +/// All the examples read the following file. +/// +/// >>> a = vortex.encoding.array([ +/// ... {'name': 'Joseph', 'age': 25}, +/// ... {'name': None, 'age': 31}, +/// ... {'name': 'Angela', 'age': None}, +/// ... {'name': 'Mikhail', 'age': 57}, +/// ... {'name': None, 'age': None}, +/// ... ]) +/// >>> vortex.io.write(a, "a.vortex") +/// +/// Read only those rows whose age column is greater than 35: +/// +/// >>> e = vortex.io.read("a.vortex", row_filter = vortex.expr.column("age") > 35) +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 57 +/// ] +/// -- child 1 type: string +/// [ +/// "Mikhail" +/// ] +/// +/// Read only those rows whose age column lies in (21, 33]. Notice that we must use parentheses +/// because of the Python precedence rules for ``&``: +/// +/// >>> age = vortex.expr.column("age") +/// >>> e = vortex.io.read("a.vortex", row_filter = (age > 21) & (age <= 33)) +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25, +/// 31 +/// ] +/// -- child 1 type: string +/// [ +/// "Joseph", +/// null +/// ] +/// +/// Read only those rows whose name is `Joseph`: +/// +/// >>> name = vortex.expr.column("name") +/// >>> e = vortex.io.read("a.vortex", row_filter = name == "Joseph") +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25 +/// ] +/// -- child 1 type: string +/// [ +/// "Joseph" +/// ] +/// +/// Read rows whose name is `Angela` or whose age is between 20 and 30, inclusive. Notice that the +/// Angela row is excluded because its age is null. The entire row filtering expression therefore +/// evaluates to null which is interpreted as false: +/// +/// >>> name = vortex.expr.column("name") +/// >>> e = vortex.io.read("a.vortex", row_filter = (name == "Angela") | ((age >= 20) & (age <= 30))) +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25 +/// ] +/// -- child 1 type: string +/// [ +/// "Joseph" +/// ] +#[pyclass(name = "Expr", module = "vortex")] +pub struct PyExpr { + inner: Arc, +} + +impl PyExpr { + pub fn unwrap(&self) -> &Arc { + &self.inner + } +} + +fn py_binary_opeartor<'py>( + left: PyRef<'py, PyExpr>, + operator: Operator, + right: Bound<'py, PyExpr>, +) -> PyResult> { + Bound::new( + left.py(), + PyExpr { + inner: Arc::new(BinaryExpr::new( + left.inner.clone(), + operator, + right.borrow().inner.clone(), + )), + }, + ) +} + +fn coerce_expr<'py>(value: &Bound<'py, PyAny>) -> PyResult> { + let nonnull = Nullability::NonNullable; + if let Ok(value) = value.downcast::() { + Ok(value.clone()) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Null, value) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Primitive(PType::I64, nonnull), value) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Primitive(PType::F64, nonnull), value) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Utf8(nonnull), value) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Binary(nonnull), value) + } else { + Err(PyValueError::new_err(format!( + "expected None, int, float, str, or bytes but found: {}", + value + ))) + } +} + +#[pymethods] +impl PyExpr { + fn __eq__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Eq, coerce_expr(right)?) + } + + fn __neq__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::NotEq, coerce_expr(right)?) + } + + fn __gt__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Gt, coerce_expr(right)?) + } + + fn __ge__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Gte, coerce_expr(right)?) + } + + fn __lt__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Lt, coerce_expr(right)?) + } + + fn __le__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Lte, coerce_expr(right)?) + } + + fn __and__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::And, coerce_expr(right)?) + } + + fn __or__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Or, coerce_expr(right)?) + } +} + +/// A named column. +/// +/// See :class:`.Expr` for more examples. +/// +/// Example +/// ======= +/// +/// A filter that selects only those rows whose name is `Joseph`: +/// +/// >>> name = vortex.expr.column("name") +/// >>> filter = name == "Joseph" +/// +#[pyfunction] +pub fn column<'py>(name: &Bound<'py, PyString>) -> PyResult> { + let py = name.py(); + let name: String = name.extract()?; + Bound::new( + py, + PyExpr { + inner: Arc::new(Column::new(Field::Name(name))), + }, + ) +} + +#[pyfunction] +pub fn _literal<'py>( + dtype: &Bound<'py, PyDType>, + value: &Bound<'py, PyAny>, +) -> PyResult> { + scalar(dtype.borrow().unwrap().clone(), value) +} + +pub fn scalar<'py>(dtype: DType, value: &Bound<'py, PyAny>) -> PyResult> { + let py = value.py(); + Bound::new( + py, + PyExpr { + inner: Arc::new(Literal::new(Scalar::new( + dtype.clone(), + scalar_value(dtype, value)?, + ))), + }, + ) +} + +pub fn scalar_value(dtype: DType, value: &Bound<'_, PyAny>) -> PyResult { + match dtype { + DType::Null => { + value.downcast::()?; + Ok(ScalarValue::Null) + } + DType::Bool(_) => { + let value = value.downcast::()?; + Ok(ScalarValue::Bool(value.extract()?)) + } + DType::Primitive(ptype, _) => { + let pvalue = match ptype { + PType::I8 => PValue::I8(value.extract()?), + PType::I16 => PValue::I16(value.extract()?), + PType::I32 => PValue::I32(value.extract()?), + PType::I64 => PValue::I64(value.extract()?), + PType::U8 => PValue::U8(value.extract()?), + PType::U16 => PValue::U16(value.extract()?), + PType::U32 => PValue::U32(value.extract()?), + PType::U64 => PValue::U64(value.extract()?), + PType::F16 => { + let float = value.extract::()?; + PValue::F16(f16::from_f32(float)) + } + PType::F32 => PValue::F32(value.extract()?), + PType::F64 => PValue::F64(value.extract()?), + }; + Ok(ScalarValue::Primitive(pvalue)) + } + DType::Utf8(_) => Ok(ScalarValue::BufferString(value.extract::()?.into())), + DType::Binary(_) => Ok(ScalarValue::Buffer(value.extract::<&[u8]>()?.into())), + DType::Struct(..) => todo!(), + DType::List(element_type, _) => { + let list = value.downcast::(); + let values: Vec = list + .iter() + .map(|element| scalar_value(element_type.as_ref().clone(), element)) + .collect::>>()?; + Ok(ScalarValue::List(Arc::from(values.into_boxed_slice()))) + } + DType::Extension(..) => todo!(), + } +} diff --git a/pyvortex/src/io.rs b/pyvortex/src/io.rs new file mode 100644 index 0000000000..0a1320722d --- /dev/null +++ b/pyvortex/src/io.rs @@ -0,0 +1,255 @@ +use std::path::Path; + +use futures::StreamExt; +use pyo3::exceptions::PyTypeError; +use pyo3::prelude::*; +use pyo3::pyfunction; +use pyo3::types::{PyList, PyLong, PyString}; +use tokio::fs::File; +use vortex::array::ChunkedArray; +use vortex::{Array, Context}; +use vortex_dtype::field::Field; +use vortex_error::VortexResult; +use vortex_serde::io::TokioAdapter; +use vortex_serde::layouts::{ + LayoutContext, LayoutDeserializer, LayoutReaderBuilder, LayoutWriter, Projection, RowFilter, +}; + +use crate::error::PyVortexError; +use crate::expr::PyExpr; +use crate::PyArray; + +/// Read a vortex struct array from the local filesystem. +/// +/// Parameters +/// ---------- +/// f : :class:`str` +/// +/// The file path. +/// +/// Examples +/// -------- +/// +/// Read an array with a structured column and nulls at multiple levels and in multiple columns. +/// +/// >>> a = vortex.encoding.array([ +/// ... {'name': 'Joseph', 'age': 25}, +/// ... {'name': None, 'age': 31}, +/// ... {'name': 'Angela', 'age': None}, +/// ... {'name': 'Mikhail', 'age': 57}, +/// ... {'name': None, 'age': None}, +/// ... ]) +/// >>> vortex.io.write(a, "a.vortex") +/// >>> b = vortex.io.read("a.vortex") +/// >>> b.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25, +/// 31, +/// null, +/// 57, +/// null +/// ] +/// -- child 1 type: string +/// [ +/// "Joseph", +/// null, +/// "Angela", +/// "Mikhail", +/// null +/// ] +/// +/// Read just the age column: +/// +/// >>> c = vortex.io.read("a.vortex", projection = ["age"]) +/// >>> c.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25, +/// 31, +/// null, +/// 57, +/// null +/// ] +/// +/// Read just the name column, by its index: +/// +/// >>> d = vortex.io.read("a.vortex", projection = [1]) +/// >>> d.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: string +/// [ +/// "Joseph", +/// null, +/// "Angela", +/// "Mikhail", +/// null +/// ] +/// +/// +/// Keep rows with an age above 35. This will read O(N_KEPT) rows, when the file format allows. +/// +/// >>> e = vortex.io.read("a.vortex", row_filter = vortex.expr.column("age") > 35) +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 57 +/// ] +/// -- child 1 type: string +/// [ +/// "Mikhail" +/// ] +/// +/// TODO(DK): Repeating a column in a projection does not work +/// +/// Read the age column by name, twice, and the name column by index, once: +/// +/// >>> # e = vortex.io.read("a.vortex", projection = ["age", 1, "age"]) +/// >>> # e.to_arrow() +/// +/// TODO(DK): Top-level nullness does not work. +/// +/// >>> a = vortex.encoding.array([ +/// ... {'name': 'Joseph', 'age': 25}, +/// ... {'name': None, 'age': 31}, +/// ... {'name': 'Angela', 'age': None}, +/// ... None, +/// ... {'name': 'Mikhail', 'age': 57}, +/// ... {'name': None, 'age': None}, +/// ... ]) +/// >>> vortex.io.write(a, "a.vortex") +/// >>> b = vortex.io.read("a.vortex") +/// >>> # b.to_arrow() +/// +#[pyfunction] +#[pyo3(signature = (f, projection = None, row_filter = None))] +pub fn read<'py>( + f: &Bound<'py, PyString>, + projection: Option<&Bound<'py, PyAny>>, + row_filter: Option<&Bound<'py, PyExpr>>, +) -> PyResult> { + async fn run( + fname: &str, + projection: Projection, + row_filter: Option, + ) -> VortexResult { + let file = File::open(Path::new(fname)).await?; + + let mut builder: LayoutReaderBuilder> = LayoutReaderBuilder::new( + TokioAdapter(file), // TODO(dk): Why didn't we implement this on File directly? + LayoutDeserializer::new(Context::default().into(), LayoutContext::default().into()), + ) + .with_projection(projection); + + if let Some(row_filter) = row_filter { + builder = builder.with_row_filter(row_filter); + } + + let stream = builder.build().await?; + + let dtype = stream.schema().into_dtype(); + + let vecs = stream + .collect::>>() + .await + .into_iter() // TODO(dk) unclear why I need two collects to pacify the compiler + .collect::>>()?; + + if vecs.len() == 1 { + Ok(vecs.into_iter().next().unwrap()) + } else { + ChunkedArray::try_new(vecs, dtype).map(|e| e.into()) + } + } + + let fname = f.to_str()?; // TODO(dk): support file objects + + let projection = match projection { + None => Projection::All, + Some(projection) => { + let list: &Bound<'py, PyList> = projection.downcast()?; + Projection::Flat( + list.iter() + .map(|field| -> PyResult { + if field.clone().is_instance_of::() { + Ok(Field::Name( + field.downcast::()?.to_str()?.to_string(), + )) + } else if field.is_instance_of::() { + Ok(Field::Index(field.extract()?)) + } else { + Err(PyTypeError::new_err(format!( + "projection: expected list of string, int, and None, but found: {}.", + field, + ))) + } + }) + .collect::>>()?, + ) + } + }; + + let row_filter = row_filter.map(|x| RowFilter::new(x.borrow().unwrap().clone())); + + let inner = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()? + .block_on(run(fname, projection, row_filter)) + .map_err(PyVortexError::new)?; + + Bound::new(f.py(), PyArray::new(inner)) +} + +#[pyfunction] +/// Write a vortex struct array to the local filesystem. +/// +/// Parameters +/// ---------- +/// array : :class:`vortex.encoding.Array` +/// +/// The array. Must be an array of structures. +/// +/// f : :class:`str` +/// +/// The file path. +/// +/// Examples +/// -------- +/// +/// Write the array `a` to the local file `a.vortex`. +/// +/// >>> a = vortex.encoding.array([ +/// ... {'x': 1}, +/// ... {'x': 2}, +/// ... {'x': 10}, +/// ... {'x': 11}, +/// ... {'x': None}, +/// ... ]) +/// >>> vortex.io.write(a, "a.vortex") +/// +pub fn write(array: &Bound<'_, PyArray>, f: &Bound<'_, PyString>) -> PyResult<()> { + async fn run(array: &Array, fname: &str) -> VortexResult<()> { + let file = File::create(Path::new(fname)).await?; + let mut writer = LayoutWriter::new(file); + + writer = writer.write_array_columns(array.clone()).await?; + writer.finalize().await?; + Ok(()) + } + + let fname = f.to_str()?; // TODO(dk): support file objects + let array = array.borrow().unwrap().clone(); + + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()? + .block_on(run(&array, fname)) + .map_err(PyVortexError::map_err) +} diff --git a/pyvortex/src/lib.rs b/pyvortex/src/lib.rs index 8da6147c1d..d9895c3a22 100644 --- a/pyvortex/src/lib.rs +++ b/pyvortex/src/lib.rs @@ -1,93 +1,53 @@ #![allow(unsafe_op_in_unsafe_fn)] -use dtype::PyDType; -use pyo3::exceptions::PyValueError; +use array::PyArray; +use expr::PyExpr; use pyo3::prelude::*; -use vortex_dtype::{DType, PType}; mod array; +mod compress; mod dtype; mod encode; mod error; -mod vortex_arrow; +mod expr; +mod io; -/// A Python module implemented in Rust. +/// Vortex is an Apache Arrow-compatible toolkit for working with compressed array data. #[pymodule] -fn _lib(_py: Python, m: &Bound) -> PyResult<()> { +fn _lib(py: Python, m: &Bound) -> PyResult<()> { pyo3_log::init(); - m.add_function(wrap_pyfunction!(encode::encode, m)?)?; - // m.add_function(wrap_pyfunction!(compress::compress, m)?)?; + let dtype = PyModule::new_bound(py, "dtype")?; + m.add_submodule(&dtype)?; - m.add_class::()?; + dtype.add_class::()?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_null, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_bool, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_int, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_uint, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_float, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_utf8, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_binary, m)?)?; - m.add_function(wrap_pyfunction!(dtype_int, m)?)?; - m.add_function(wrap_pyfunction!(dtype_uint, m)?)?; - m.add_function(wrap_pyfunction!(dtype_float, m)?)?; - m.add_function(wrap_pyfunction!(dtype_bool, m)?)?; - m.add_function(wrap_pyfunction!(dtype_utf8, m)?)?; + let encoding = PyModule::new_bound(py, "encoding")?; + m.add_submodule(&encoding)?; - Ok(()) -} + encoding.add_function(wrap_pyfunction!(encode::_encode, m)?)?; + encoding.add_function(wrap_pyfunction!(compress::compress, m)?)?; -#[pyfunction(name = "bool")] -#[pyo3(signature = (nullable = false))] -fn dtype_bool(py: Python<'_>, nullable: bool) -> PyResult> { - PyDType::wrap(py, DType::Bool(nullable.into())) -} + encoding.add_class::()?; -#[pyfunction(name = "int")] -#[pyo3(signature = (width = None, nullable = false))] -fn dtype_int(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { - let dtype = if let Some(width) = width { - match width { - 8 => DType::Primitive(PType::I8, nullable.into()), - 16 => DType::Primitive(PType::I16, nullable.into()), - 32 => DType::Primitive(PType::I32, nullable.into()), - 64 => DType::Primitive(PType::I64, nullable.into()), - _ => return Err(PyValueError::new_err("Invalid int width")), - } - } else { - DType::Primitive(PType::I64, nullable.into()) - }; - PyDType::wrap(py, dtype) -} + let io = PyModule::new_bound(py, "io")?; + m.add_submodule(&io)?; -#[pyfunction(name = "uint")] -#[pyo3(signature = (width = None, nullable = false))] -fn dtype_uint(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { - let dtype = if let Some(width) = width { - match width { - 8 => DType::Primitive(PType::U8, nullable.into()), - 16 => DType::Primitive(PType::U16, nullable.into()), - 32 => DType::Primitive(PType::U32, nullable.into()), - 64 => DType::Primitive(PType::U64, nullable.into()), - _ => return Err(PyValueError::new_err("Invalid uint width")), - } - } else { - DType::Primitive(PType::U64, nullable.into()) - }; - PyDType::wrap(py, dtype) -} + io.add_function(wrap_pyfunction!(io::read, m)?)?; + io.add_function(wrap_pyfunction!(io::write, m)?)?; -#[pyfunction(name = "float")] -#[pyo3(signature = (width = None, nullable = false))] -fn dtype_float(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { - let dtype = if let Some(width) = width { - match width { - 16 => DType::Primitive(PType::F16, nullable.into()), - 32 => DType::Primitive(PType::F32, nullable.into()), - 64 => DType::Primitive(PType::F64, nullable.into()), - _ => return Err(PyValueError::new_err("Invalid float width")), - } - } else { - DType::Primitive(PType::F64, nullable.into()) - }; - PyDType::wrap(py, dtype) -} + let expr = PyModule::new_bound(py, "expr")?; + m.add_submodule(&expr)?; -#[pyfunction(name = "utf8")] -#[pyo3(signature = (nullable = false))] -fn dtype_utf8(py: Python<'_>, nullable: bool) -> PyResult> { - PyDType::wrap(py, DType::Utf8(nullable.into())) + expr.add_function(wrap_pyfunction!(expr::column, m)?)?; + expr.add_class::()?; + + Ok(()) } diff --git a/pyvortex/src/vortex_arrow.rs b/pyvortex/src/vortex_arrow.rs deleted file mode 100644 index d74f049631..0000000000 --- a/pyvortex/src/vortex_arrow.rs +++ /dev/null @@ -1,48 +0,0 @@ -use arrow::array::{Array as ArrowArray, ArrayRef}; -use arrow::error::ArrowError; -use arrow::pyarrow::ToPyArrow; -use pyo3::exceptions::PyValueError; -use pyo3::prelude::*; -use pyo3::types::{IntoPyDict, PyList}; -use vortex::array::ChunkedArray; -use vortex::{Array, IntoCanonical}; - -pub fn map_arrow_err(error: ArrowError) -> PyErr { - PyValueError::new_err(error.to_string()) -} - -pub fn export_array<'py>(py: Python<'py>, array: &Array) -> PyResult> { - // NOTE(ngates): for struct arrays, we could also return a RecordBatchStreamReader. - // NOTE(robert): Return RecordBatchStreamReader always? - let chunks: Vec = if let Ok(chunked_array) = ChunkedArray::try_from(array) { - chunked_array - .chunks() - .map(|chunk| chunk.into_canonical().unwrap().into_arrow()) - .collect() - } else { - vec![array.clone().into_canonical().unwrap().into_arrow()] - }; - if chunks.is_empty() { - return Err(PyValueError::new_err("No chunks in array")); - } - - // Export the schema once - let data_type = chunks[0].data_type().clone(); - let pa_data_type = data_type.to_pyarrow(py)?; - - // Iterate each chunk, export it to Arrow FFI, then import as a pyarrow array - let chunks: PyResult> = chunks - .iter() - .map(|arrow_array| arrow_array.into_data().to_pyarrow(py)) - .collect(); - - // Import pyarrow and its Array class - let mod_pyarrow = PyModule::import_bound(py, "pyarrow")?; - - // Combine into a chunked array - mod_pyarrow.call_method( - "chunked_array", - (PyList::new_bound(py, chunks?),), - Some(&[("type", pa_data_type)].into_py_dict_bound(py)), - ) -} diff --git a/pyvortex/test/test_array.py b/pyvortex/test/test_array.py index db93368baf..803f4996f7 100644 --- a/pyvortex/test/test_array.py +++ b/pyvortex/test/test_array.py @@ -4,19 +4,19 @@ def test_primitive_array_round_trip(): a = pa.array([0, 1, 2, 3]) - arr = vortex.encode(a) - assert arr.to_arrow().combine_chunks() == a + arr = vortex.array(a) + assert arr.to_arrow() == a def test_varbin_array_round_trip(): a = pa.array(["a", "b", "c"]) - arr = vortex.encode(a) - assert arr.to_arrow().combine_chunks() == a + arr = vortex.array(a) + assert arr.to_arrow() == a def test_varbin_array_take(): - a = vortex.encode(pa.array(["a", "b", "c", "d"])) - assert a.take(vortex.encode(pa.array([0, 2]))).to_arrow().combine_chunks() == pa.array( + a = vortex.array(pa.array(["a", "b", "c", "d"])) + assert a.take(vortex.array(pa.array([0, 2]))).to_arrow() == pa.array( ["a", "c"], type=pa.utf8(), ) @@ -24,5 +24,5 @@ def test_varbin_array_take(): def test_empty_array(): a = pa.array([], type=pa.uint8()) - primitive = vortex.encode(a) + primitive = vortex.array(a) assert primitive.to_arrow().type == pa.uint8() diff --git a/pyvortex/test/test_compress.py b/pyvortex/test/test_compress.py index 368e7c488b..0b089a5899 100644 --- a/pyvortex/test/test_compress.py +++ b/pyvortex/test/test_compress.py @@ -11,54 +11,54 @@ @pytest.mark.xfail(reason="Not yet implemented") def test_primitive_compress(): a = pa.array([0, 0, 0, 0, 9, 9, 9, 9, 1, 5]) - arr_compressed = vortex.compress(vortex.encode(a)) - assert not isinstance(arr_compressed, vortex.PrimitiveArray) + arr_compressed = vortex.compress(vortex.array(a)) + assert not isinstance(arr_compressed, vortex.encoding.PrimitiveArray) assert arr_compressed.nbytes < a.nbytes @pytest.mark.xfail(reason="Not yet implemented") def test_for_compress(): a = pa.array(np.arange(10_000) + 10_000_000) - arr_compressed = vortex.compress(vortex.encode(a)) - assert not isinstance(arr_compressed, vortex.PrimitiveArray) + arr_compressed = vortex.compress(vortex.array(a)) + assert not isinstance(arr_compressed, vortex.encoding.PrimitiveArray) @pytest.mark.xfail(reason="Not yet implemented") def test_bool_compress(): - a = vortex.encode(pa.array([False] * 10_000 + [True] * 10_000)) + a = vortex.array(pa.array([False] * 10_000 + [True] * 10_000)) arr_compressed = vortex.compress(a) assert len(arr_compressed) == 20_000 - assert isinstance(arr_compressed, vortex.RoaringBoolArray) + assert isinstance(arr_compressed, vortex.encoding.RoaringBoolArray) assert arr_compressed.nbytes < a.nbytes @pytest.mark.xfail(reason="Not yet implemented") def test_roaring_bool_encode(): - a = vortex.encode(pa.array([True] * 10_000)) - rarr = vortex.RoaringBoolArray.encode(a) - assert isinstance(rarr, vortex.RoaringBoolArray) + a = vortex.array(pa.array([True] * 10_000)) + rarr = vortex.encoding.RoaringBoolArray.encode(a) + assert isinstance(rarr, vortex.encoding.RoaringBoolArray) assert rarr.nbytes < a.nbytes @pytest.mark.xfail(reason="Not yet implemented") def test_arange_encode(): - a = vortex.encode(pa.array(np.arange(10_000), type=pa.uint32())) + a = vortex.array(pa.array(np.arange(10_000), type=pa.uint32())) compressed = vortex.compress(a) - assert isinstance(compressed, vortex.DeltaArray) or isinstance(compressed, vortex.RoaringIntArray) + assert isinstance(compressed, vortex.encoding.DeltaArray) or isinstance(compressed, vortex.encoding.RoaringIntArray) assert compressed.nbytes < a.nbytes @pytest.mark.xfail(reason="Not yet implemented") def test_zigzag_encode(): - a = vortex.encode(pa.array([-1, -1, 0, -1, 1, -1])) - zarr = vortex.ZigZagArray.encode(a) - assert isinstance(zarr, vortex.ZigZagArray) + a = vortex.array(pa.array([-1, -1, 0, -1, 1, -1])) + zarr = vortex.encoding.ZigZagArray.encode(a) + assert isinstance(zarr, vortex.encoding.ZigZagArray) # TODO(ngates): support decoding once we have decompressor. def test_chunked_encode(): chunked = pa.chunked_array([pa.array([0, 1, 2]), pa.array([3, 4, 5])]) - encoded = vortex.encode(chunked) + encoded = vortex.array(chunked) assert encoded.to_arrow().combine_chunks() == pa.array([0, 1, 2, 3, 4, 5]) @@ -69,7 +69,7 @@ def test_table_encode(): "string": pa.chunked_array([pa.array(["a", "b", "c"]), pa.array(["d", "e", "f"])]), } ) - encoded = vortex.encode(table) + encoded = vortex.array(table) assert encoded.to_arrow().combine_chunks() == pa.StructArray.from_arrays( [pa.array([0, 1, 2, 3, 4, 5]), pa.array(["a", "b", "c", "d", "e", "f"])], names=["number", "string"] ) @@ -79,6 +79,6 @@ def test_table_encode(): def test_taxi(): curdir = Path(os.path.dirname(__file__)).parent.parent table = pq.read_table(curdir / "bench-vortex/data/yellow-tripdata-2023-11.parquet") - compressed = vortex.compress(vortex.encode(table[:100])) + compressed = vortex.compress(vortex.array(table[:100])) decompressed = compressed.to_arrow() assert not decompressed diff --git a/pyvortex/test/test_dtype.py b/pyvortex/test/test_dtype.py index 207a8ebc53..8b5deaaf0b 100644 --- a/pyvortex/test/test_dtype.py +++ b/pyvortex/test/test_dtype.py @@ -2,9 +2,9 @@ def test_int(): - assert str(vortex.int()) == "i64" - assert str(vortex.int(32)) == "i32" - assert str(vortex.int(32, nullable=True)) == "i32?" - assert str(vortex.uint(32)) == "u32" - assert str(vortex.float(16)) == "f16" - assert str(vortex.bool(nullable=True)) == "bool?" + assert str(vortex.dtype.int()) == "int(64, False)" + assert str(vortex.dtype.int(32)) == "int(32, False)" + assert str(vortex.dtype.int(32, nullable=True)) == "int(32, True)" + assert str(vortex.dtype.uint(32)) == "uint(32, False)" + assert str(vortex.dtype.float(16)) == "float(16, False)" + assert str(vortex.dtype.bool(nullable=True)) == "bool(True)" diff --git a/requirements-dev.lock b/requirements-dev.lock index ece7184320..686f367bc6 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -11,8 +11,18 @@ -e file:. -e file:pyvortex +accessible-pygments==0.0.5 + # via pydata-sphinx-theme +alabaster==1.0.0 + # via sphinx +asttokens==2.4.1 + # via stack-data babel==2.14.0 # via mkdocs-material + # via pydata-sphinx-theme + # via sphinx +beautifulsoup4==4.12.3 + # via pydata-sphinx-theme bracex==2.4 # via wcmatch certifi==2024.2.2 @@ -23,20 +33,33 @@ click==8.1.7 # via mkdocs colorama==0.4.6 # via mkdocs-material +decorator==5.1.1 + # via ipython +docutils==0.21.2 + # via pydata-sphinx-theme + # via sphinx +executing==2.0.1 + # via stack-data ghp-import==2.1.0 # via mkdocs idna==3.6 # via requests +imagesize==1.4.1 + # via sphinx importlib-metadata==7.1.0 # via mike importlib-resources==6.4.0 # via mike iniconfig==2.0.0 # via pytest +ipython==8.26.0 +jedi==0.19.1 + # via ipython jinja2==3.1.3 # via mike # via mkdocs # via mkdocs-material + # via sphinx markdown==3.6 # via mkdocs # via mkdocs-material @@ -44,6 +67,8 @@ markdown==3.6 markupsafe==2.1.5 # via jinja2 # via mkdocs +matplotlib-inline==0.1.7 + # via ipython maturin==1.5.1 mergedeep==1.3.4 # via mkdocs @@ -57,24 +82,44 @@ mkdocs-material==9.5.17 mkdocs-material-extensions==1.3.1 # via mkdocs-material numpy==1.26.4 + # via pandas # via pyarrow packaging==24.0 # via mkdocs + # via pydata-sphinx-theme # via pytest + # via sphinx paginate==0.5.6 # via mkdocs-material +pandas==2.2.2 +parso==0.8.4 + # via jedi pathspec==0.12.1 # via mkdocs +pexpect==4.9.0 + # via ipython pip==24.0 platformdirs==4.2.0 # via mkdocs pluggy==1.4.0 # via pytest +prompt-toolkit==3.0.47 + # via ipython +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data py-cpuinfo==9.0.0 # via pytest-benchmark pyarrow==15.0.2 +pydata-sphinx-theme==0.15.4 + # via vortex pygments==2.17.2 + # via accessible-pygments + # via ipython # via mkdocs-material + # via pydata-sphinx-theme + # via sphinx pymdown-extensions==10.7.1 # via mkdocs-material pyparsing==3.1.2 @@ -84,6 +129,9 @@ pytest==8.1.1 pytest-benchmark==4.0.0 python-dateutil==2.9.0.post0 # via ghp-import + # via pandas +pytz==2024.1 + # via pandas pyyaml==6.0.1 # via mike # via mkdocs @@ -95,9 +143,39 @@ regex==2023.12.25 # via mkdocs-material requests==2.31.0 # via mkdocs-material + # via sphinx ruff==0.3.5 six==1.16.0 + # via asttokens # via python-dateutil +snowballstemmer==2.2.0 + # via sphinx +soupsieve==2.6 + # via beautifulsoup4 +sphinx==8.0.2 + # via pydata-sphinx-theme +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +stack-data==0.6.3 + # via ipython +traitlets==5.14.3 + # via ipython + # via matplotlib-inline +typing-extensions==4.12.2 + # via ipython + # via pydata-sphinx-theme +tzdata==2024.1 + # via pandas urllib3==2.2.1 # via requests verspec==0.1.0 @@ -106,5 +184,7 @@ watchdog==4.0.0 # via mkdocs wcmatch==8.5.1 # via mkdocs-include-markdown-plugin +wcwidth==0.2.13 + # via prompt-toolkit zipp==3.18.1 # via importlib-metadata diff --git a/requirements.lock b/requirements.lock index 64a44c3203..12b45467d6 100644 --- a/requirements.lock +++ b/requirements.lock @@ -11,3 +11,60 @@ -e file:. -e file:pyvortex +accessible-pygments==0.0.5 + # via pydata-sphinx-theme +alabaster==1.0.0 + # via sphinx +babel==2.16.0 + # via pydata-sphinx-theme + # via sphinx +beautifulsoup4==4.12.3 + # via pydata-sphinx-theme +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +docutils==0.21.2 + # via pydata-sphinx-theme + # via sphinx +idna==3.8 + # via requests +imagesize==1.4.1 + # via sphinx +jinja2==3.1.4 + # via sphinx +markupsafe==2.1.5 + # via jinja2 +packaging==24.1 + # via pydata-sphinx-theme + # via sphinx +pydata-sphinx-theme==0.15.4 + # via vortex +pygments==2.18.0 + # via accessible-pygments + # via pydata-sphinx-theme + # via sphinx +requests==2.32.3 + # via sphinx +snowballstemmer==2.2.0 + # via sphinx +soupsieve==2.6 + # via beautifulsoup4 +sphinx==8.0.2 + # via pydata-sphinx-theme +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +typing-extensions==4.12.2 + # via pydata-sphinx-theme +urllib3==2.2.2 + # via requests diff --git a/vortex-array/src/array/struct_/mod.rs b/vortex-array/src/array/struct_/mod.rs index d2e6e05d9c..5d32c4899a 100644 --- a/vortex-array/src/array/struct_/mod.rs +++ b/vortex-array/src/array/struct_/mod.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; use vortex_dtype::field::Field; -use vortex_dtype::{DType, FieldName, FieldNames, Nullability, StructDType}; +use vortex_dtype::{DType, FieldName, FieldNames, StructDType}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use crate::stats::{ArrayStatisticsCompute, StatsSet}; @@ -55,14 +55,14 @@ impl StructArray { let mut children = Vec::with_capacity(fields.len() + 1); children.extend(fields); - if let Some(v) = validity.into_array() { + if let Some(v) = validity.clone().into_array() { children.push(v); } Self::try_from_parts( DType::Struct( StructDType::new(names, field_dtypes), - Nullability::NonNullable, + validity.nullability(), ), length, StructMetadata { diff --git a/vortex-array/src/arrow/dtype.rs b/vortex-array/src/arrow/dtype.rs index 6398e6e37e..b71554bb2b 100644 --- a/vortex-array/src/arrow/dtype.rs +++ b/vortex-array/src/arrow/dtype.rs @@ -46,7 +46,7 @@ impl FromArrowType for DType { .map(|f| Self::from_arrow(f.as_ref())) .collect_vec(), ), - Nullability::NonNullable, + Nullability::NonNullable, // Must match From for Array ) } } diff --git a/vortex-array/src/arrow/recordbatch.rs b/vortex-array/src/arrow/recordbatch.rs index 020988890e..68ce12b5f5 100644 --- a/vortex-array/src/arrow/recordbatch.rs +++ b/vortex-array/src/arrow/recordbatch.rs @@ -24,7 +24,7 @@ impl From for Array { .map(|(array, field)| Array::from_arrow(array.clone(), field.is_nullable())) .collect(), value.num_rows(), - Validity::AllValid, + Validity::NonNullable, // Must match FromArrowType for DType ) .unwrap() .into() diff --git a/vortex-array/src/compute/take.rs b/vortex-array/src/compute/take.rs index 6f4136a021..dcab99d1a2 100644 --- a/vortex-array/src/compute/take.rs +++ b/vortex-array/src/compute/take.rs @@ -1,7 +1,7 @@ use log::info; -use vortex_error::{vortex_err, VortexResult}; +use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::{Array, IntoCanonical}; +use crate::{Array, ArrayDType, IntoCanonical}; pub trait TakeFn { fn take(&self, indices: &Array) -> VortexResult; @@ -10,6 +10,10 @@ pub trait TakeFn { pub fn take(array: &Array, indices: &Array) -> VortexResult { array.with_dyn(|a| { if let Some(take) = a.take() { + if !indices.dtype().is_int() { + vortex_bail!(InvalidArgument: "indices: expected int or uint array, but found: {}", indices.dtype().python_repr()); + } + return take.take(indices); } diff --git a/vortex-dtype/src/dtype.rs b/vortex-dtype/src/dtype.rs index c513604962..97450d367a 100644 --- a/vortex-dtype/src/dtype.rs +++ b/vortex-dtype/src/dtype.rs @@ -120,6 +120,10 @@ impl DType { _ => None, } } + + pub fn python_repr(&self) -> DTypePythonRepr { + DTypePythonRepr { dtype: self } + } } impl Display for DType { @@ -154,6 +158,51 @@ impl Display for DType { } } +pub struct DTypePythonRepr<'a> { + dtype: &'a DType, +} + +impl Display for DTypePythonRepr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.dtype { + Null => write!(f, "null()"), + Bool(n) => write!(f, "bool({})", n.python_repr()), + Primitive(p, n) => match p { + PType::U8 | PType::U16 | PType::U32 | PType::U64 => { + write!(f, "uint({}, {})", p.bit_width(), n.python_repr()) + } + PType::I8 | PType::I16 | PType::I32 | PType::I64 => { + write!(f, "int({}, {})", p.bit_width(), n.python_repr()) + } + PType::F16 | PType::F32 | PType::F64 => { + write!(f, "float({}, {})", p.bit_width(), n.python_repr()) + } + }, + Utf8(n) => write!(f, "utf8({})", n.python_repr()), + Binary(n) => write!(f, "binary({})", n.python_repr()), + Struct(st, n) => write!( + f, + "struct({{{}}}, {})", + st.names() + .iter() + .zip(st.dtypes().iter()) + .map(|(n, dt)| format!("\"{}\": {}", n, dt.python_repr())) + .join(", "), + n.python_repr() + ), + List(c, n) => write!(f, "list({}, {})", c.python_repr(), n.python_repr()), + Extension(ext, n) => { + write!(f, "ext(\"{}\", ", ext.id().python_repr())?; + match ext.metadata() { + None => write!(f, "None")?, + Some(metadata) => write!(f, "{}", metadata.python_repr())?, + }; + write!(f, ", {})", n.python_repr()) + } + } + } +} + #[derive(Debug, Clone, PartialOrd, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct StructDType { diff --git a/vortex-dtype/src/extension.rs b/vortex-dtype/src/extension.rs index 18daec5505..6651d97fe5 100644 --- a/vortex-dtype/src/extension.rs +++ b/vortex-dtype/src/extension.rs @@ -9,6 +9,10 @@ impl ExtID { pub fn new(value: Arc) -> Self { Self(value) } + + pub fn python_repr(&self) -> ExtIDPythonRepr { + ExtIDPythonRepr { ext_id: self } + } } impl Display for ExtID { @@ -29,6 +33,18 @@ impl From<&str> for ExtID { } } +pub struct ExtIDPythonRepr<'a> { + ext_id: &'a ExtID, +} + +impl Display for ExtIDPythonRepr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.ext_id { + ExtID(id) => write!(f, "\"{}\"", id.escape_default()), + } + } +} + #[derive(Debug, Clone, PartialOrd, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct ExtMetadata(Arc<[u8]>); @@ -37,6 +53,10 @@ impl ExtMetadata { pub fn new(value: Arc<[u8]>) -> Self { Self(value) } + + pub fn python_repr(&self) -> ExtMetadataPythonRepr { + ExtMetadataPythonRepr { ext_metadata: self } + } } impl AsRef<[u8]> for ExtMetadata { @@ -51,6 +71,18 @@ impl From<&[u8]> for ExtMetadata { } } +pub struct ExtMetadataPythonRepr<'a> { + ext_metadata: &'a ExtMetadata, +} + +impl Display for ExtMetadataPythonRepr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.ext_metadata { + ExtMetadata(metadata) => write!(f, "\"{}\"", metadata.escape_ascii()), + } + } +} + #[derive(Debug, Clone, PartialOrd, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct ExtDType { diff --git a/vortex-dtype/src/nullability.rs b/vortex-dtype/src/nullability.rs index 14a33dc930..e9decc31cd 100644 --- a/vortex-dtype/src/nullability.rs +++ b/vortex-dtype/src/nullability.rs @@ -7,6 +7,12 @@ pub enum Nullability { Nullable, } +impl Nullability { + pub fn python_repr(&self) -> NullabilityPythonRepr { + NullabilityPythonRepr { nullability: self } + } +} + impl From for Nullability { fn from(value: bool) -> Self { if value { @@ -34,3 +40,16 @@ impl Display for Nullability { } } } + +pub struct NullabilityPythonRepr<'a> { + nullability: &'a Nullability, +} + +impl Display for NullabilityPythonRepr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.nullability { + Nullability::NonNullable => write!(f, "False"), + Nullability::Nullable => write!(f, "True"), + } + } +}