diff --git a/Cargo.lock b/Cargo.lock index 3e5a63e6fe..3048a968ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3090,18 +3090,26 @@ name = "pyvortex" version = "0.7.0" dependencies = [ "arrow", + "flexbuffers", + "futures", + "itertools 0.13.0", "log", "paste", "pyo3", "pyo3-log", + "tokio", "vortex-alp", "vortex-array", "vortex-dict", "vortex-dtype", "vortex-error", + "vortex-expr", "vortex-fastlanes", "vortex-roaring", "vortex-runend", + "vortex-sampling-compressor", + "vortex-scalar", + "vortex-serde", "vortex-zigzag", ] diff --git a/pyvortex/Cargo.toml b/pyvortex/Cargo.toml index 6b17782f96..3368ca9290 100644 --- a/pyvortex/Cargo.toml +++ b/pyvortex/Cargo.toml @@ -22,19 +22,27 @@ doctest = false [dependencies] arrow = { workspace = true, features = ["pyarrow"] } +flexbuffers = { workspace = true } +futures = { workspace = true } log = { workspace = true } paste = { workspace = true } pyo3 = { workspace = true } pyo3-log = { workspace = true } +tokio = { workspace = true, features = ["fs"] } vortex-alp = { workspace = true } vortex-array = { workspace = true } vortex-dict = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } +vortex-expr = { workspace = true } vortex-fastlanes = { workspace = true } vortex-roaring = { workspace = true } vortex-runend = { workspace = true } +vortex-sampling-compressor = { workspace = true } +vortex-serde = { workspace = true, features = ["tokio"] } +vortex-scalar = { workspace = true } vortex-zigzag = { workspace = true } +itertools = { workspace = true } # We may need this workaround? # https://pyo3.rs/v0.20.2/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror diff --git a/pyvortex/docs/dtype.rst b/pyvortex/docs/dtype.rst new file mode 100644 index 0000000000..9c30bc80b9 --- /dev/null +++ b/pyvortex/docs/dtype.rst @@ -0,0 +1,7 @@ +Array Data Types +================ + +.. automodule:: vortex.dtype + :members: + :imported-members: + diff --git a/pyvortex/docs/encoding.rst b/pyvortex/docs/encoding.rst new file mode 100644 index 0000000000..8448777fba --- /dev/null +++ b/pyvortex/docs/encoding.rst @@ -0,0 +1,7 @@ +Arrays +====== + +.. automodule:: vortex.encoding + :members: + :imported-members: + :special-members: __len__ diff --git a/pyvortex/docs/expr.rst b/pyvortex/docs/expr.rst new file mode 100644 index 0000000000..854aec35ce --- /dev/null +++ b/pyvortex/docs/expr.rst @@ -0,0 +1,6 @@ +Row Filter Expressions +====================== + +.. automodule:: vortex.expr + :members: + :imported-members: diff --git a/pyvortex/docs/index.rst b/pyvortex/docs/index.rst index ddd43736d3..3fba86d504 100644 --- a/pyvortex/docs/index.rst +++ b/pyvortex/docs/index.rst @@ -6,9 +6,13 @@ Vortex documentation ==================== -.. automodule:: vortex - :members: +Vortex is an Apache Arrow-compatible toolkit for working with compressed array data. .. toctree:: :maxdepth: 2 :caption: Contents: + + encoding + dtype + io + expr diff --git a/pyvortex/docs/io.rst b/pyvortex/docs/io.rst new file mode 100644 index 0000000000..f2cc405ce9 --- /dev/null +++ b/pyvortex/docs/io.rst @@ -0,0 +1,6 @@ +Input and Output +================ + +.. automodule:: vortex.io + :members: + :imported-members: diff --git a/pyvortex/pyproject.toml b/pyvortex/pyproject.toml index f472a8dbb7..aa5dfd129a 100644 --- a/pyvortex/pyproject.toml +++ b/pyvortex/pyproject.toml @@ -5,7 +5,9 @@ description = "Add your description here" authors = [ { name = "Nicholas Gates", email = "nick@nickgates.com" } ] -dependencies = [] +dependencies = [ + "pydata-sphinx-theme>=0.15.4", +] requires-python = ">= 3.11" classifiers = ["Private :: Do Not Upload"] @@ -17,7 +19,10 @@ build-backend = "maturin" managed = true dev-dependencies = [ "pyarrow>=15.0.0", - "pip" + "pip", + "sphinx>=8.0.2", + "ipython>=8.26.0", + "pandas>=2.2.2", ] [tool.maturin] diff --git a/pyvortex/python/vortex/__init__.py b/pyvortex/python/vortex/__init__.py index 5e75e016eb..f6ec061509 100644 --- a/pyvortex/python/vortex/__init__.py +++ b/pyvortex/python/vortex/__init__.py @@ -1,4 +1,9 @@ -from ._lib import * # noqa: F403 +from . import encoding from ._lib import __doc__ as module_docs +from ._lib import dtype, expr, io __doc__ = module_docs +del module_docs +array = encoding.array + +__all__ = ["array", dtype, expr, io, encoding] diff --git a/pyvortex/python/vortex/encoding.py b/pyvortex/python/vortex/encoding.py new file mode 100644 index 0000000000..27900d5bdb --- /dev/null +++ b/pyvortex/python/vortex/encoding.py @@ -0,0 +1,166 @@ +import pyarrow + +from ._lib import encoding as _encoding + +__doc__ = _encoding.__doc__ + +Array = _encoding.Array +compress = _encoding.compress + + +def _Array_to_pandas(self: _encoding.Array, *, name: str | None = None, flatten: bool = False): + """Construct a Pandas dataframe from this Vortex array. + + Parameters + ---------- + obj : :class:`pyarrow.Array` or :class:`list` + The elements of this array or list become the elements of the Vortex array. + + name : :class:`str`, optional + The name of the column in the newly created dataframe. If unspecified, use `x`. + + flatten : :class:`bool` + If :obj:`True`, Struct columns are flattened in the dataframe. See the examples. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + + Construct a :class:`.pandas.DataFrame` with one column named `animals` from the contents of a Vortex + array: + + >>> array = vortex.encoding.array(['dog', 'cat', 'mouse', 'rat']) + >>> array.to_pandas(name='animals') + animals + 0 dog + 1 cat + 2 mouse + 3 rat + + Construct a :class:`.pandas.DataFrame` with the default column name: + + >>> array = vortex.encoding.array(['dog', 'cat', 'mouse', 'rat']) + >>> array.to_pandas() + x + 0 dog + 1 cat + 2 mouse + 3 rat + + Construct a dataframe with a Struct-typed column: + + >>> array = vortex.encoding.array([ + ... {'name': 'Joseph', 'age': 25}, + ... {'name': 'Narendra', 'age': 31}, + ... {'name': 'Angela', 'age': 33}, + ... {'name': 'Mikhail', 'age': 57}, + ... ]) + >>> array.to_pandas() + x + 0 {'age': 25, 'name': 'Joseph'} + 1 {'age': 31, 'name': 'Narendra'} + 2 {'age': 33, 'name': 'Angela'} + 3 {'age': 57, 'name': 'Mikhail'} + + Lift the struct fields to the top-level in the dataframe: + + >>> array.to_pandas(flatten=True) + x.age x.name + 0 25 Joseph + 1 31 Narendra + 2 33 Angela + 3 57 Mikhail + + """ + name = name or "x" + table = pyarrow.Table.from_arrays([self.to_arrow()], [name]) + if flatten: + table = table.flatten() + return table.to_pandas() + + +Array.to_pandas = _Array_to_pandas + + +def _Array_to_numpy(self: _encoding.Array, *, zero_copy_only: bool = True): + """Construct a NumPy array from this Vortex array. + + This is an alias for :code:`self.to_arrow().to_numpy(zero_copy_only)` + + Returns + ------- + :class:`numpy.ndarray` + + Examples + -------- + + Construct an ndarray from a Vortex array: + + >>> array = vortex.encoding.array([1, 0, 0, 1]) + >>> array.to_numpy() + array([1, 0, 0, 1]) + + """ + return self.to_arrow().to_numpy(zero_copy_only=zero_copy_only) + + +Array.to_numpy = _Array_to_numpy + + +def array(obj: pyarrow.Array | list) -> Array: + """The main entry point for creating Vortex arrays from other Python objects. + + This function is also available as ``vortex.array``. + + Parameters + ---------- + obj : :class:`pyarrow.Array` or :class:`list` + The elements of this array or list become the elements of the Vortex array. + + Returns + ------- + :class:`vortex.encoding.Array` + + Examples + -------- + + A Vortex array containing the first three integers. + + >>> vortex.encoding.array([1, 2, 3]).to_arrow() + + [ + 1, + 2, + 3 + ] + + The same Vortex array with a null value in the third position. + + >>> vortex.encoding.array([1, 2, None, 3]).to_arrow() + + [ + 1, + 2, + null, + 3 + ] + + Initialize a Vortex array from an Arrow array: + + >>> arrow = pyarrow.array(['Hello', 'it', 'is', 'me']) + >>> vortex.encoding.array(arrow).to_arrow() + + [ + "Hello", + "it", + "is", + "me" + ] + + """ + if isinstance(obj, list): + return _encoding._encode(pyarrow.array(obj)) + return _encoding._encode(obj) diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 2043c874b1..01baa833a8 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -9,8 +9,10 @@ use vortex::{Array, ArrayDType, IntoCanonical}; use crate::dtype::PyDType; use crate::error::PyVortexError; +use crate::python_repr::PythonRepr; #[pyclass(name = "Array", module = "vortex", sequence, subclass)] +/// An array of zero or more *rows* each with the same set of *columns*. pub struct PyArray { inner: Array, } @@ -27,14 +29,31 @@ impl PyArray { #[pymethods] impl PyArray { + /// Convert this array to an Arrow array. + /// + /// Returns + /// ------- + /// :class:`pyarrow.Array` + /// + /// Examples + /// -------- + /// + /// Round-trip an Arrow array through a Vortex array: + /// + /// >>> vortex.encoding.array([1, 2, 3]).to_arrow() + /// + /// [ + /// 1, + /// 2, + /// 3 + /// ] fn to_arrow(self_: PyRef<'_, Self>) -> PyResult> { // NOTE(ngates): for struct arrays, we could also return a RecordBatchStreamReader. - // NOTE(robert): Return RecordBatchStreamReader always? let py = self_.py(); let vortex = &self_.inner; - let chunks: Vec = if let Ok(chunked_array) = ChunkedArray::try_from(vortex) { - chunked_array + if let Ok(chunked_array) = ChunkedArray::try_from(vortex) { + let chunks: Vec = chunked_array .chunks() .map(|chunk| -> PyResult { Ok(chunk @@ -42,37 +61,32 @@ impl PyArray { .map_err(PyVortexError::map_err)? .into_arrow()) }) - .collect::>>()? + .collect::>>()?; + if chunks.is_empty() { + return Err(PyValueError::new_err("No chunks in array")); + } + let pa_data_type = chunks[0].data_type().clone().to_pyarrow(py)?; + let chunks: PyResult> = chunks + .iter() + .map(|arrow_array| arrow_array.into_data().to_pyarrow(py)) + .collect(); + + // Combine into a chunked array + PyModule::import_bound(py, "pyarrow")?.call_method( + "chunked_array", + (PyList::new_bound(py, chunks?),), + Some(&[("type", pa_data_type)].into_py_dict_bound(py)), + ) } else { - vec![vortex + Ok(vortex .clone() .into_canonical() .map_err(PyVortexError::map_err)? - .into_arrow()] - }; - if chunks.is_empty() { - return Err(PyValueError::new_err("No chunks in array")); + .into_arrow() + .into_data() + .to_pyarrow(py)? + .into_bound(py)) } - - // Export the schema once - let data_type = chunks[0].data_type().clone(); - let pa_data_type = data_type.to_pyarrow(py)?; - - // Iterate each chunk, export it to Arrow FFI, then import as a pyarrow array - let chunks: PyResult> = chunks - .iter() - .map(|arrow_array| arrow_array.into_data().to_pyarrow(py)) - .collect(); - - // Import pyarrow and its Array class - let mod_pyarrow = PyModule::import_bound(py, "pyarrow")?; - - // Combine into a chunked array - mod_pyarrow.call_method( - "chunked_array", - (PyList::new_bound(py, chunks?),), - Some(&[("type", pa_data_type)].into_py_dict_bound(py)), - ) } fn __len__(&self) -> usize { @@ -93,14 +107,79 @@ impl PyArray { self.inner.nbytes() } + /// The data type of this array. + /// + /// Returns + /// ------- + /// :class:`vortex.dtype.DType` + /// + /// Examples + /// -------- + /// + /// By default, :func:`vortex.encoding.array` uses the largest available bit-width: + /// + /// >>> vortex.encoding.array([1, 2, 3]).dtype + /// int(64, False) + /// + /// Including a :obj:`None` forces a nullable type: + /// + /// >>> vortex.encoding.array([1, None, 2, 3]).dtype + /// int(64, True) + /// + /// A UTF-8 string array: + /// + /// >>> vortex.encoding.array(['hello, ', 'is', 'it', 'me?']).dtype + /// utf8(False) #[getter] fn dtype(self_: PyRef) -> PyResult> { PyDType::wrap(self_.py(), self_.inner.dtype().clone()) } - fn take<'py>(&self, indices: PyRef<'py, Self>) -> PyResult> { - take(&self.inner, indices.unwrap()) + /// Filter, permute, and/or repeat elements by their index. + /// + /// Returns + /// ------- + /// :class:`vortex.encoding.Array` + /// + /// Examples + /// -------- + /// + /// Keep only the first and third elements: + /// + /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) + /// >>> indices = vortex.encoding.array([0, 2]) + /// >>> a.take(indices).to_arrow() + /// + /// [ + /// "a", + /// "c" + /// ] + /// + /// Permute and repeat the first and second elements: + /// + /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) + /// >>> indices = vortex.encoding.array([0, 1, 1, 0]) + /// >>> a.take(indices).to_arrow() + /// + /// [ + /// "a", + /// "b", + /// "b", + /// "a" + /// ] + fn take<'py>(&self, indices: &Bound<'py, PyArray>) -> PyResult> { + let py = indices.py(); + let indices = &indices.borrow().inner; + + if !indices.dtype().is_int() { + return Err(PyValueError::new_err(format!( + "indices: expected int or uint array, but found: {}", + indices.dtype().python_repr() + ))); + } + + take(&self.inner, indices) .map_err(PyVortexError::map_err) - .and_then(|arr| Bound::new(indices.py(), PyArray { inner: arr })) + .and_then(|arr| Bound::new(py, PyArray { inner: arr })) } } diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs index 71c29be9fa..009f1ef3a2 100644 --- a/pyvortex/src/compress.rs +++ b/pyvortex/src/compress.rs @@ -1,45 +1,45 @@ -use std::sync::Arc; - -use pyo3::types::PyType; -use pyo3::{pyclass, pyfunction, pymethods, Py, PyResult, Python}; -use vortex::compress::{CompressConfig, CompressCtx}; +use pyo3::prelude::*; +use vortex_sampling_compressor::SamplingCompressor; use crate::array::PyArray; use crate::error::PyVortexError; -#[derive(Clone)] -#[pyclass(name = "CompressConfig", module = "vortex")] -pub struct PyCompressConfig { - inner: CompressConfig, -} - -#[pymethods] -impl PyCompressConfig { - #[classmethod] - pub fn default(cls: &PyType) -> PyResult> { - Py::new(cls.py(), ::default()) - } -} - -impl Default for PyCompressConfig { - fn default() -> Self { - Self { - inner: CompressConfig::new().with_enabled(ENCODINGS.iter().cloned()), - } - } -} - #[pyfunction] -#[pyo3(signature = (arr, opts = None))] -pub fn compress( - py: Python<'_>, - arr: &PyArray, - opts: Option, -) -> PyResult> { - let compress_opts = opts.unwrap_or_default().inner; - let ctx = CompressCtx::new(Arc::new(compress_opts)); - let compressed = py - .allow_threads(|| ctx.compress(arr.unwrap(), None)) - .map_err(PyVortexError::map_err)?; +/// Attempt to compress a vortex array. +/// +/// Parameters +/// ---------- +/// array : :class:`vortex.encoding.Array` +/// The array. +/// +/// Examples +/// -------- +/// +/// Compress a very sparse array of integers: +/// +/// >>> a = vortex.encoding.array([42 for _ in range(1000)]) +/// >>> str(vortex.encoding.compress(a)) +/// 'vortex.constant(0x0a)(i64, len=1000)' +/// +/// Compress an array of increasing integers: +/// +/// >>> a = vortex.encoding.array(list(range(1000))) +/// >>> str(vortex.encoding.compress(a)) +/// 'fastlanes.for(0x0f)(i64, len=1000)' +/// +/// Compress an array of increasing floating-point numbers and a few nulls: +/// +/// >>> a = vortex.encoding.array([ +/// ... float(x) if x % 20 != 0 else None +/// ... for x in range(1000) +/// ... ]) +/// >>> str(vortex.encoding.compress(a)) +/// 'vortex.alp(0x0d)(f64?, len=1000)' +pub fn compress<'py>(array: &Bound<'py, PyArray>) -> PyResult> { + let compressor = SamplingCompressor::default(); + let inner = compressor + .compress(array.borrow().unwrap(), None) + .map_err(PyVortexError::new)? + .into_array(); Bound::new(array.py(), PyArray::new(inner)) } diff --git a/pyvortex/src/dtype.rs b/pyvortex/src/dtype.rs index 548f0cd563..499beb3edd 100644 --- a/pyvortex/src/dtype.rs +++ b/pyvortex/src/dtype.rs @@ -1,11 +1,16 @@ use arrow::datatypes::{DataType, Field}; use arrow::pyarrow::FromPyArrow; +use pyo3::exceptions::PyValueError; use pyo3::types::PyType; -use pyo3::{pyclass, pymethods, Bound, Py, PyAny, PyResult, Python}; +use pyo3::{pyclass, pyfunction, pymethods, Bound, Py, PyAny, PyResult, Python}; use vortex::arrow::FromArrowType; -use vortex_dtype::DType; +use vortex_dtype::{DType, PType}; + +use crate::python_repr::PythonRepr; #[pyclass(name = "DType", module = "vortex", subclass)] +/// A data type describes the set of operations available on a given column. These operations are +/// implemented by the column *encoding*. Each data type is implemented by one or more encodings. pub struct PyDType { inner: DType, } @@ -26,8 +31,12 @@ impl PyDType { format!("{}", self.inner) } + fn __repr__(&self) -> String { + self.inner.python_repr().to_string() + } + #[classmethod] - fn from_pyarrow( + fn from_arrow( cls: &Bound, #[pyo3(from_py_with = "import_arrow_dtype")] arrow_dtype: DataType, nullable: bool, @@ -37,8 +46,253 @@ impl PyDType { DType::from_arrow(&Field::new("_", arrow_dtype, nullable)), ) } + + fn maybe_columns(&self) -> Option> { + match &self.inner { + DType::Null => None, + DType::Bool(_) => None, + DType::Primitive(..) => None, + DType::Utf8(_) => None, + DType::Binary(_) => None, + DType::Struct(child, _) => Some(child.names().iter().map(|x| x.to_string()).collect()), + DType::List(..) => None, + DType::Extension(..) => None, + } + } } fn import_arrow_dtype(obj: &Bound) -> PyResult { DataType::from_pyarrow_bound(obj) } + +#[pyfunction(name = "null")] +#[pyo3(signature = ())] +/// Construct the data type for a column containing only the null value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting only :obj:`None`. +/// +/// >>> vortex.dtype.null() +/// null() +pub fn dtype_null(py: Python<'_>) -> PyResult> { + PyDType::wrap(py, DType::Null) +} + +#[pyfunction(name = "bool")] +#[pyo3(signature = (nullable = false))] +/// Construct a Boolean data type. +/// +/// Parameters +/// ---------- +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting :obj:`None`, :obj:`True`, and :obj:`False`. +/// +/// >>> vortex.dtype.bool(True) +/// bool(True) +/// +/// A data type permitting just :obj:`True` and :obj:`False`. +/// +/// >>> vortex.dtype.bool(False) +/// bool(False) +pub fn dtype_bool(py: Python<'_>, nullable: bool) -> PyResult> { + PyDType::wrap(py, DType::Bool(nullable.into())) +} + +#[pyfunction(name = "int")] +#[pyo3(signature = (width = None, nullable = false))] +/// Construct a signed integral data type. +/// +/// Parameters +/// ---------- +/// width : one of 8, 16, 32, and 64. +/// +/// The bit width determines the span of valid values. If :obj:`None`, 64 is used. +/// +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting :obj:`None` and the integers from -128 to 127, inclusive: +/// +/// >>> vortex.dtype.int(8, True) +/// int(8, True) +/// +/// A data type permitting just the integers from -2,147,483,648 to 2,147,483,647, inclusive: +/// +/// >>> vortex.dtype.int(32, False) +/// int(32, False) +pub fn dtype_int(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { + let dtype = if let Some(width) = width { + match width { + 8 => DType::Primitive(PType::I8, nullable.into()), + 16 => DType::Primitive(PType::I16, nullable.into()), + 32 => DType::Primitive(PType::I32, nullable.into()), + 64 => DType::Primitive(PType::I64, nullable.into()), + _ => return Err(PyValueError::new_err("Invalid int width")), + } + } else { + DType::Primitive(PType::I64, nullable.into()) + }; + PyDType::wrap(py, dtype) +} + +#[pyfunction(name = "uint")] +#[pyo3(signature = (width = None, nullable = false))] +/// Construct an unsigned integral data type. +/// +/// Parameters +/// ---------- +/// width : one of 8, 16, 32, and 64. +/// +/// The bit width determines the span of valid values. If :obj:`None`, 64 is used. +/// +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting :obj:`None` and the integers from 0 to 255, inclusive: +/// +/// >>> vortex.dtype.uint(8, True) +/// uint(8, True) +/// +/// A data type permitting just the integers from 0 to 4,294,967,296 inclusive: +/// +/// >>> vortex.dtype.uint(32, False) +/// uint(32, False) +pub fn dtype_uint(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { + let dtype = if let Some(width) = width { + match width { + 8 => DType::Primitive(PType::U8, nullable.into()), + 16 => DType::Primitive(PType::U16, nullable.into()), + 32 => DType::Primitive(PType::U32, nullable.into()), + 64 => DType::Primitive(PType::U64, nullable.into()), + _ => return Err(PyValueError::new_err("Invalid uint width")), + } + } else { + DType::Primitive(PType::U64, nullable.into()) + }; + PyDType::wrap(py, dtype) +} + +#[pyfunction(name = "float")] +#[pyo3(signature = (width = None, nullable = false))] +/// Construct an IEEE 754 binary floating-point data type. +/// +/// Parameters +/// ---------- +/// width : one of 16, 32, and 64. +/// +/// The bit width determines the range and precision of the floating-point values. If +/// :obj:`None`, 64 is used. +/// +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting :obj:`None` as well as IEEE 754 binary16 floating-point values. Values +/// larger than 65,520 or less than -65,520 will respectively round to positive and negative +/// infinity. +/// +/// >>> vortex.dtype.float(16, False) +/// float(16, False) +pub fn dtype_float(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { + let dtype = if let Some(width) = width { + match width { + 16 => DType::Primitive(PType::F16, nullable.into()), + 32 => DType::Primitive(PType::F32, nullable.into()), + 64 => DType::Primitive(PType::F64, nullable.into()), + _ => return Err(PyValueError::new_err("Invalid float width")), + } + } else { + DType::Primitive(PType::F64, nullable.into()) + }; + PyDType::wrap(py, dtype) +} + +#[pyfunction(name = "utf8")] +#[pyo3(signature = (nullable = false))] +/// Construct a UTF-8-encoded string data type. +/// +/// Parameters +/// ---------- +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting any UTF-8-encoded string, such as :code:`"Hello World"`, but not +/// permitting :obj:`None`. +/// +/// >>> vortex.dtype.utf8(False) +/// utf8(False) +pub fn dtype_utf8(py: Python<'_>, nullable: bool) -> PyResult> { + PyDType::wrap(py, DType::Utf8(nullable.into())) +} + +#[pyfunction(name = "binary")] +#[pyo3(signature = (nullable = false))] +/// Construct a data type for binary strings. +/// +/// Parameters +/// ---------- +/// nullable : :class:`bool` +/// +/// When :obj:`True`, :obj:`None` is a permissible value. +/// +/// Returns +/// ------- +/// :class:`vortex.dtype.DType` +/// +/// Examples +/// -------- +/// +/// A data type permitting any string of bytes but not permitting :obj:`None`. +/// +/// >>> vortex.dtype.binary(False) +/// binary(False) +pub fn dtype_binary(py: Python<'_>, nullable: bool) -> PyResult> { + PyDType::wrap(py, DType::Binary(nullable.into())) +} diff --git a/pyvortex/src/encode.rs b/pyvortex/src/encode.rs index 850fc45563..ced01879cc 100644 --- a/pyvortex/src/encode.rs +++ b/pyvortex/src/encode.rs @@ -13,10 +13,9 @@ use vortex_dtype::DType; use crate::array::PyArray; use crate::error::PyVortexError; -/// The main entry point for creating enc arrays from other Python objects. -/// +// Private, ergo not documented. #[pyfunction] -pub fn encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { +pub fn _encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { let pa = obj.py().import_bound("pyarrow")?; let pa_array = pa.getattr("Array")?; let chunked_array = pa.getattr("ChunkedArray")?; diff --git a/pyvortex/src/expr.rs b/pyvortex/src/expr.rs new file mode 100644 index 0000000000..2af04e49f2 --- /dev/null +++ b/pyvortex/src/expr.rs @@ -0,0 +1,293 @@ +use std::sync::Arc; + +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use pyo3::types::*; +use vortex_dtype::field::Field; +use vortex_dtype::half::f16; +use vortex_dtype::{DType, Nullability, PType}; +use vortex_expr::{BinaryExpr, Column, Literal, Operator, VortexExpr}; +use vortex_scalar::{PValue, Scalar, ScalarValue}; + +use crate::dtype::PyDType; + +/// An expression describes how to filter rows when reading an array from a file. +/// +/// Examples +/// ======== +/// +/// All the examples read the following file. +/// +/// >>> a = vortex.encoding.array([ +/// ... {'name': 'Joseph', 'age': 25}, +/// ... {'name': None, 'age': 31}, +/// ... {'name': 'Angela', 'age': None}, +/// ... {'name': 'Mikhail', 'age': 57}, +/// ... {'name': None, 'age': None}, +/// ... ]) +/// >>> vortex.io.write(a, "a.vortex") +/// +/// Read only those rows whose age column is greater than 35: +/// +/// >>> e = vortex.io.read("a.vortex", row_filter = vortex.expr.column("age") > 35) +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 57 +/// ] +/// -- child 1 type: string +/// [ +/// "Mikhail" +/// ] +/// +/// Read only those rows whose age column lies in (21, 33]. Notice that we must use parentheses +/// because of the Python precedence rules for ``&``: +/// +/// >>> age = vortex.expr.column("age") +/// >>> e = vortex.io.read("a.vortex", row_filter = (age > 21) & (age <= 33)) +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25, +/// 31 +/// ] +/// -- child 1 type: string +/// [ +/// "Joseph", +/// null +/// ] +/// +/// Read only those rows whose name is `Joseph`: +/// +/// >>> name = vortex.expr.column("name") +/// >>> e = vortex.io.read("a.vortex", row_filter = name == "Joseph") +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25 +/// ] +/// -- child 1 type: string +/// [ +/// "Joseph" +/// ] +/// +/// Read rows whose name is `Angela` or whose age is between 20 and 30, inclusive. Notice that the +/// Angela row is excluded because its age is null. The entire row filtering expression therefore +/// evaluates to null which is interpreted as false: +/// +/// >>> name = vortex.expr.column("name") +/// >>> e = vortex.io.read("a.vortex", row_filter = (name == "Angela") | ((age >= 20) & (age <= 30))) +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25 +/// ] +/// -- child 1 type: string +/// [ +/// "Joseph" +/// ] +#[pyclass(name = "Expr", module = "vortex")] +pub struct PyExpr { + inner: Arc, +} + +impl PyExpr { + pub fn unwrap(&self) -> &Arc { + &self.inner + } +} + +fn py_binary_opeartor<'py>( + left: PyRef<'py, PyExpr>, + operator: Operator, + right: Bound<'py, PyExpr>, +) -> PyResult> { + Bound::new( + left.py(), + PyExpr { + inner: Arc::new(BinaryExpr::new( + left.inner.clone(), + operator, + right.borrow().inner.clone(), + )), + }, + ) +} + +fn coerce_expr<'py>(value: &Bound<'py, PyAny>) -> PyResult> { + let nonnull = Nullability::NonNullable; + if let Ok(value) = value.downcast::() { + Ok(value.clone()) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Null, value) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Primitive(PType::I64, nonnull), value) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Primitive(PType::F64, nonnull), value) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Utf8(nonnull), value) + } else if let Ok(value) = value.downcast::() { + scalar(DType::Binary(nonnull), value) + } else { + Err(PyValueError::new_err(format!( + "expected None, int, float, str, or bytes but found: {}", + value + ))) + } +} + +#[pymethods] +impl PyExpr { + fn __eq__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Eq, coerce_expr(right)?) + } + + fn __neq__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::NotEq, coerce_expr(right)?) + } + + fn __gt__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Gt, coerce_expr(right)?) + } + + fn __ge__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Gte, coerce_expr(right)?) + } + + fn __lt__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Lt, coerce_expr(right)?) + } + + fn __le__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Lte, coerce_expr(right)?) + } + + fn __and__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::And, coerce_expr(right)?) + } + + fn __or__<'py>( + self_: PyRef<'py, Self>, + right: &Bound<'py, PyAny>, + ) -> PyResult> { + py_binary_opeartor(self_, Operator::Or, coerce_expr(right)?) + } +} + +/// A named column. +/// +/// See :class:`.Expr` for more examples. +/// +/// Example +/// ======= +/// +/// A filter that selects only those rows whose name is `Joseph`: +/// +/// >>> name = vortex.expr.column("name") +/// >>> filter = name == "Joseph" +/// +#[pyfunction] +pub fn column<'py>(name: &Bound<'py, PyString>) -> PyResult> { + let py = name.py(); + let name: String = name.extract()?; + Bound::new( + py, + PyExpr { + inner: Arc::new(Column::new(Field::Name(name))), + }, + ) +} + +#[pyfunction] +pub fn _literal<'py>( + dtype: &Bound<'py, PyDType>, + value: &Bound<'py, PyAny>, +) -> PyResult> { + scalar(dtype.borrow().unwrap().clone(), value) +} + +pub fn scalar<'py>(dtype: DType, value: &Bound<'py, PyAny>) -> PyResult> { + let py = value.py(); + Bound::new( + py, + PyExpr { + inner: Arc::new(Literal::new(Scalar::new( + dtype.clone(), + scalar_value(dtype, value)?, + ))), + }, + ) +} + +pub fn scalar_value(dtype: DType, value: &Bound<'_, PyAny>) -> PyResult { + match dtype { + DType::Null => { + value.downcast::()?; + Ok(ScalarValue::Null) + } + DType::Bool(_) => { + let value = value.downcast::()?; + Ok(ScalarValue::Bool(value.extract()?)) + } + DType::Primitive(ptype, _) => { + let pvalue = match ptype { + PType::I8 => PValue::I8(value.extract()?), + PType::I16 => PValue::I16(value.extract()?), + PType::I32 => PValue::I32(value.extract()?), + PType::I64 => PValue::I64(value.extract()?), + PType::U8 => PValue::U8(value.extract()?), + PType::U16 => PValue::U16(value.extract()?), + PType::U32 => PValue::U32(value.extract()?), + PType::U64 => PValue::U64(value.extract()?), + PType::F16 => { + let float = value.extract::()?; + PValue::F16(f16::from_f32(float)) + } + PType::F32 => PValue::F32(value.extract()?), + PType::F64 => PValue::F64(value.extract()?), + }; + Ok(ScalarValue::Primitive(pvalue)) + } + DType::Utf8(_) => Ok(ScalarValue::BufferString(value.extract::()?.into())), + DType::Binary(_) => Ok(ScalarValue::Buffer(value.extract::<&[u8]>()?.into())), + DType::Struct(..) => todo!(), + DType::List(element_type, _) => { + let list = value.downcast::(); + let values: Vec = list + .iter() + .map(|element| scalar_value(element_type.as_ref().clone(), element)) + .collect::>>()?; + Ok(ScalarValue::List(Arc::from(values.into_boxed_slice()))) + } + DType::Extension(..) => todo!(), + } +} diff --git a/pyvortex/src/io.rs b/pyvortex/src/io.rs new file mode 100644 index 0000000000..6a22f9cc31 --- /dev/null +++ b/pyvortex/src/io.rs @@ -0,0 +1,250 @@ +use std::path::Path; + +use futures::TryStreamExt; +use pyo3::exceptions::PyTypeError; +use pyo3::prelude::*; +use pyo3::pyfunction; +use pyo3::types::{PyList, PyLong, PyString}; +use tokio::fs::File; +use vortex::array::ChunkedArray; +use vortex::{Array, Context}; +use vortex_dtype::field::Field; +use vortex_error::VortexResult; +use vortex_serde::layouts::{ + LayoutContext, LayoutDeserializer, LayoutReaderBuilder, LayoutWriter, Projection, RowFilter, +}; + +use crate::error::PyVortexError; +use crate::expr::PyExpr; +use crate::PyArray; + +/// Read a vortex struct array from the local filesystem. +/// +/// Parameters +/// ---------- +/// f : :class:`str` +/// +/// The file path. +/// +/// Examples +/// -------- +/// +/// Read an array with a structured column and nulls at multiple levels and in multiple columns. +/// +/// >>> a = vortex.encoding.array([ +/// ... {'name': 'Joseph', 'age': 25}, +/// ... {'name': None, 'age': 31}, +/// ... {'name': 'Angela', 'age': None}, +/// ... {'name': 'Mikhail', 'age': 57}, +/// ... {'name': None, 'age': None}, +/// ... ]) +/// >>> vortex.io.write(a, "a.vortex") +/// >>> b = vortex.io.read("a.vortex") +/// >>> b.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25, +/// 31, +/// null, +/// 57, +/// null +/// ] +/// -- child 1 type: string +/// [ +/// "Joseph", +/// null, +/// "Angela", +/// "Mikhail", +/// null +/// ] +/// +/// Read just the age column: +/// +/// >>> c = vortex.io.read("a.vortex", projection = ["age"]) +/// >>> c.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 25, +/// 31, +/// null, +/// 57, +/// null +/// ] +/// +/// Read just the name column, by its index: +/// +/// >>> d = vortex.io.read("a.vortex", projection = [1]) +/// >>> d.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: string +/// [ +/// "Joseph", +/// null, +/// "Angela", +/// "Mikhail", +/// null +/// ] +/// +/// +/// Keep rows with an age above 35. This will read O(N_KEPT) rows, when the file format allows. +/// +/// >>> e = vortex.io.read("a.vortex", row_filter = vortex.expr.column("age") > 35) +/// >>> e.to_arrow() +/// +/// -- is_valid: all not null +/// -- child 0 type: int64 +/// [ +/// 57 +/// ] +/// -- child 1 type: string +/// [ +/// "Mikhail" +/// ] +/// +/// TODO(DK): Repeating a column in a projection does not work +/// +/// Read the age column by name, twice, and the name column by index, once: +/// +/// >>> # e = vortex.io.read("a.vortex", projection = ["age", 1, "age"]) +/// >>> # e.to_arrow() +/// +/// TODO(DK): Top-level nullness does not work. +/// +/// >>> a = vortex.encoding.array([ +/// ... {'name': 'Joseph', 'age': 25}, +/// ... {'name': None, 'age': 31}, +/// ... {'name': 'Angela', 'age': None}, +/// ... None, +/// ... {'name': 'Mikhail', 'age': 57}, +/// ... {'name': None, 'age': None}, +/// ... ]) +/// >>> vortex.io.write(a, "a.vortex") +/// >>> b = vortex.io.read("a.vortex") +/// >>> # b.to_arrow() +/// +#[pyfunction] +#[pyo3(signature = (f, projection = None, row_filter = None))] +pub fn read<'py>( + f: &Bound<'py, PyString>, + projection: Option<&Bound<'py, PyAny>>, + row_filter: Option<&Bound<'py, PyExpr>>, +) -> PyResult> { + async fn run( + fname: &str, + projection: Projection, + row_filter: Option, + ) -> VortexResult { + let file = File::open(Path::new(fname)).await?; + + let mut builder: LayoutReaderBuilder = LayoutReaderBuilder::new( + file, + LayoutDeserializer::new(Context::default().into(), LayoutContext::default().into()), + ) + .with_projection(projection); + + if let Some(row_filter) = row_filter { + builder = builder.with_row_filter(row_filter); + } + + let stream = builder.build().await?; + + let dtype = stream.schema().into_dtype(); + + let vecs: Vec = stream.try_collect().await?; + + if vecs.len() == 1 { + Ok(vecs.into_iter().next().unwrap()) + } else { + ChunkedArray::try_new(vecs, dtype).map(|e| e.into()) + } + } + + let fname = f.to_str()?; // TODO(dk): support file objects + + let projection = match projection { + None => Projection::All, + Some(projection) => { + let list: &Bound<'py, PyList> = projection.downcast()?; + Projection::Flat( + list.iter() + .map(|field| -> PyResult { + if field.clone().is_instance_of::() { + Ok(Field::Name( + field.downcast::()?.to_str()?.to_string(), + )) + } else if field.is_instance_of::() { + Ok(Field::Index(field.extract()?)) + } else { + Err(PyTypeError::new_err(format!( + "projection: expected list of string, int, and None, but found: {}.", + field, + ))) + } + }) + .collect::>>()?, + ) + } + }; + + let row_filter = row_filter.map(|x| RowFilter::new(x.borrow().unwrap().clone())); + + let inner = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()? + .block_on(run(fname, projection, row_filter)) + .map_err(PyVortexError::new)?; + + Bound::new(f.py(), PyArray::new(inner)) +} + +#[pyfunction] +/// Write a vortex struct array to the local filesystem. +/// +/// Parameters +/// ---------- +/// array : :class:`vortex.encoding.Array` +/// +/// The array. Must be an array of structures. +/// +/// f : :class:`str` +/// +/// The file path. +/// +/// Examples +/// -------- +/// +/// Write the array `a` to the local file `a.vortex`. +/// +/// >>> a = vortex.encoding.array([ +/// ... {'x': 1}, +/// ... {'x': 2}, +/// ... {'x': 10}, +/// ... {'x': 11}, +/// ... {'x': None}, +/// ... ]) +/// >>> vortex.io.write(a, "a.vortex") +/// +pub fn write(array: &Bound<'_, PyArray>, f: &Bound<'_, PyString>) -> PyResult<()> { + async fn run(array: &Array, fname: &str) -> VortexResult<()> { + let file = File::create(Path::new(fname)).await?; + let mut writer = LayoutWriter::new(file); + + writer = writer.write_array_columns(array.clone()).await?; + writer.finalize().await?; + Ok(()) + } + + let fname = f.to_str()?; // TODO(dk): support file objects + let array = array.borrow().unwrap().clone(); + + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()? + .block_on(run(&array, fname)) + .map_err(PyVortexError::map_err) +} diff --git a/pyvortex/src/lib.rs b/pyvortex/src/lib.rs index da2fbd79bd..d8629e6850 100644 --- a/pyvortex/src/lib.rs +++ b/pyvortex/src/lib.rs @@ -1,92 +1,54 @@ #![allow(unsafe_op_in_unsafe_fn)] -use dtype::PyDType; -use pyo3::exceptions::PyValueError; +use array::PyArray; +use expr::PyExpr; use pyo3::prelude::*; -use vortex_dtype::{DType, PType}; mod array; +mod compress; mod dtype; mod encode; mod error; +mod expr; +mod io; +mod python_repr; -/// A Python module implemented in Rust. +/// Vortex is an Apache Arrow-compatible toolkit for working with compressed array data. #[pymodule] -fn _lib(_py: Python, m: &Bound) -> PyResult<()> { +fn _lib(py: Python, m: &Bound) -> PyResult<()> { pyo3_log::init(); - m.add_function(wrap_pyfunction!(encode::encode, m)?)?; - // m.add_function(wrap_pyfunction!(compress::compress, m)?)?; + let dtype = PyModule::new_bound(py, "dtype")?; + m.add_submodule(&dtype)?; - m.add_class::()?; + dtype.add_class::()?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_null, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_bool, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_int, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_uint, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_float, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_utf8, m)?)?; + dtype.add_function(wrap_pyfunction!(dtype::dtype_binary, m)?)?; - m.add_function(wrap_pyfunction!(dtype_int, m)?)?; - m.add_function(wrap_pyfunction!(dtype_uint, m)?)?; - m.add_function(wrap_pyfunction!(dtype_float, m)?)?; - m.add_function(wrap_pyfunction!(dtype_bool, m)?)?; - m.add_function(wrap_pyfunction!(dtype_utf8, m)?)?; + let encoding = PyModule::new_bound(py, "encoding")?; + m.add_submodule(&encoding)?; - Ok(()) -} + encoding.add_function(wrap_pyfunction!(encode::_encode, m)?)?; + encoding.add_function(wrap_pyfunction!(compress::compress, m)?)?; -#[pyfunction(name = "bool")] -#[pyo3(signature = (nullable = false))] -fn dtype_bool(py: Python<'_>, nullable: bool) -> PyResult> { - PyDType::wrap(py, DType::Bool(nullable.into())) -} + encoding.add_class::()?; -#[pyfunction(name = "int")] -#[pyo3(signature = (width = None, nullable = false))] -fn dtype_int(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { - let dtype = if let Some(width) = width { - match width { - 8 => DType::Primitive(PType::I8, nullable.into()), - 16 => DType::Primitive(PType::I16, nullable.into()), - 32 => DType::Primitive(PType::I32, nullable.into()), - 64 => DType::Primitive(PType::I64, nullable.into()), - _ => return Err(PyValueError::new_err("Invalid int width")), - } - } else { - DType::Primitive(PType::I64, nullable.into()) - }; - PyDType::wrap(py, dtype) -} + let io = PyModule::new_bound(py, "io")?; + m.add_submodule(&io)?; -#[pyfunction(name = "uint")] -#[pyo3(signature = (width = None, nullable = false))] -fn dtype_uint(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { - let dtype = if let Some(width) = width { - match width { - 8 => DType::Primitive(PType::U8, nullable.into()), - 16 => DType::Primitive(PType::U16, nullable.into()), - 32 => DType::Primitive(PType::U32, nullable.into()), - 64 => DType::Primitive(PType::U64, nullable.into()), - _ => return Err(PyValueError::new_err("Invalid uint width")), - } - } else { - DType::Primitive(PType::U64, nullable.into()) - }; - PyDType::wrap(py, dtype) -} + io.add_function(wrap_pyfunction!(io::read, m)?)?; + io.add_function(wrap_pyfunction!(io::write, m)?)?; -#[pyfunction(name = "float")] -#[pyo3(signature = (width = None, nullable = false))] -fn dtype_float(py: Python<'_>, width: Option, nullable: bool) -> PyResult> { - let dtype = if let Some(width) = width { - match width { - 16 => DType::Primitive(PType::F16, nullable.into()), - 32 => DType::Primitive(PType::F32, nullable.into()), - 64 => DType::Primitive(PType::F64, nullable.into()), - _ => return Err(PyValueError::new_err("Invalid float width")), - } - } else { - DType::Primitive(PType::F64, nullable.into()) - }; - PyDType::wrap(py, dtype) -} + let expr = PyModule::new_bound(py, "expr")?; + m.add_submodule(&expr)?; -#[pyfunction(name = "utf8")] -#[pyo3(signature = (nullable = false))] -fn dtype_utf8(py: Python<'_>, nullable: bool) -> PyResult> { - PyDType::wrap(py, DType::Utf8(nullable.into())) + expr.add_function(wrap_pyfunction!(expr::column, m)?)?; + expr.add_class::()?; + + Ok(()) } diff --git a/pyvortex/src/python_repr.rs b/pyvortex/src/python_repr.rs new file mode 100644 index 0000000000..83e34899a2 --- /dev/null +++ b/pyvortex/src/python_repr.rs @@ -0,0 +1,107 @@ +use std::convert::AsRef; +use std::fmt::{Display, Formatter}; + +use itertools::Itertools; +use vortex_dtype::{DType, ExtID, ExtMetadata, Nullability, PType}; + +pub trait PythonRepr { + fn python_repr(&self) -> impl Display; +} + +struct DTypePythonRepr<'a>(&'a DType); + +impl PythonRepr for DType { + fn python_repr(&self) -> impl Display { + return DTypePythonRepr(self); + } +} + +impl Display for DTypePythonRepr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let DTypePythonRepr(x) = self; + match x { + DType::Null => write!(f, "null()"), + DType::Bool(n) => write!(f, "bool({})", n.python_repr()), + DType::Primitive(p, n) => match p { + PType::U8 | PType::U16 | PType::U32 | PType::U64 => { + write!(f, "uint({}, {})", p.bit_width(), n.python_repr()) + } + PType::I8 | PType::I16 | PType::I32 | PType::I64 => { + write!(f, "int({}, {})", p.bit_width(), n.python_repr()) + } + PType::F16 | PType::F32 | PType::F64 => { + write!(f, "float({}, {})", p.bit_width(), n.python_repr()) + } + }, + DType::Utf8(n) => write!(f, "utf8({})", n.python_repr()), + DType::Binary(n) => write!(f, "binary({})", n.python_repr()), + DType::Struct(st, n) => write!( + f, + "struct({{{}}}, {})", + st.names() + .iter() + .zip(st.dtypes().iter()) + .map(|(n, dt)| format!("\"{}\": {}", n, dt.python_repr())) + .join(", "), + n.python_repr() + ), + DType::List(c, n) => write!(f, "list({}, {})", c.python_repr(), n.python_repr()), + DType::Extension(ext, n) => { + write!(f, "ext(\"{}\", ", ext.id().python_repr())?; + match ext.metadata() { + None => write!(f, "None")?, + Some(metadata) => write!(f, "{}", metadata.python_repr())?, + }; + write!(f, ", {})", n.python_repr()) + } + } + } +} + +struct NullabilityPythonRepr<'a>(&'a Nullability); + +impl PythonRepr for Nullability { + fn python_repr(&self) -> impl Display { + return NullabilityPythonRepr(self); + } +} + +impl Display for NullabilityPythonRepr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let NullabilityPythonRepr(x) = self; + match x { + Nullability::NonNullable => write!(f, "False"), + Nullability::Nullable => write!(f, "True"), + } + } +} + +struct ExtMetadataPythonRepr<'a>(&'a ExtMetadata); + +impl PythonRepr for ExtMetadata { + fn python_repr(&self) -> impl Display { + return ExtMetadataPythonRepr(self); + } +} + +impl Display for ExtMetadataPythonRepr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let ExtMetadataPythonRepr(metadata) = self; + write!(f, "\"{}\"", metadata.as_ref().escape_ascii()) + } +} + +struct ExtIDPythonRepr<'a>(&'a ExtID); + +impl PythonRepr for ExtID { + fn python_repr(&self) -> impl Display { + ExtIDPythonRepr(self) + } +} + +impl Display for ExtIDPythonRepr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let ExtIDPythonRepr(ext_id) = self; + write!(f, "\"{}\"", ext_id.as_ref().escape_default()) + } +} diff --git a/pyvortex/test/test_array.py b/pyvortex/test/test_array.py index 8acf7ab2fd..66b896586b 100644 --- a/pyvortex/test/test_array.py +++ b/pyvortex/test/test_array.py @@ -4,25 +4,25 @@ def test_primitive_array_round_trip(): a = pa.array([0, 1, 2, 3]) - arr = vortex.encode(a) - assert arr.to_arrow().combine_chunks() == a + arr = vortex.array(a) + assert arr.to_arrow() == a def test_array_with_nulls(): a = pa.array([b"123", None]) - arr = vortex.encode(a) - assert arr.to_arrow().combine_chunks() == a + arr = vortex.array(a) + assert arr.to_arrow() == a def test_varbin_array_round_trip(): a = pa.array(["a", "b", "c"]) - arr = vortex.encode(a) - assert arr.to_arrow().combine_chunks() == a + arr = vortex.array(a) + assert arr.to_arrow() == a def test_varbin_array_take(): - a = vortex.encode(pa.array(["a", "b", "c", "d"])) - assert a.take(vortex.encode(pa.array([0, 2]))).to_arrow().combine_chunks() == pa.array( + a = vortex.array(pa.array(["a", "b", "c", "d"])) + assert a.take(vortex.array(pa.array([0, 2]))).to_arrow() == pa.array( ["a", "c"], type=pa.utf8(), ) @@ -30,5 +30,5 @@ def test_varbin_array_take(): def test_empty_array(): a = pa.array([], type=pa.uint8()) - primitive = vortex.encode(a) + primitive = vortex.array(a) assert primitive.to_arrow().type == pa.uint8() diff --git a/pyvortex/test/test_compress.py b/pyvortex/test/test_compress.py index 368e7c488b..0b089a5899 100644 --- a/pyvortex/test/test_compress.py +++ b/pyvortex/test/test_compress.py @@ -11,54 +11,54 @@ @pytest.mark.xfail(reason="Not yet implemented") def test_primitive_compress(): a = pa.array([0, 0, 0, 0, 9, 9, 9, 9, 1, 5]) - arr_compressed = vortex.compress(vortex.encode(a)) - assert not isinstance(arr_compressed, vortex.PrimitiveArray) + arr_compressed = vortex.compress(vortex.array(a)) + assert not isinstance(arr_compressed, vortex.encoding.PrimitiveArray) assert arr_compressed.nbytes < a.nbytes @pytest.mark.xfail(reason="Not yet implemented") def test_for_compress(): a = pa.array(np.arange(10_000) + 10_000_000) - arr_compressed = vortex.compress(vortex.encode(a)) - assert not isinstance(arr_compressed, vortex.PrimitiveArray) + arr_compressed = vortex.compress(vortex.array(a)) + assert not isinstance(arr_compressed, vortex.encoding.PrimitiveArray) @pytest.mark.xfail(reason="Not yet implemented") def test_bool_compress(): - a = vortex.encode(pa.array([False] * 10_000 + [True] * 10_000)) + a = vortex.array(pa.array([False] * 10_000 + [True] * 10_000)) arr_compressed = vortex.compress(a) assert len(arr_compressed) == 20_000 - assert isinstance(arr_compressed, vortex.RoaringBoolArray) + assert isinstance(arr_compressed, vortex.encoding.RoaringBoolArray) assert arr_compressed.nbytes < a.nbytes @pytest.mark.xfail(reason="Not yet implemented") def test_roaring_bool_encode(): - a = vortex.encode(pa.array([True] * 10_000)) - rarr = vortex.RoaringBoolArray.encode(a) - assert isinstance(rarr, vortex.RoaringBoolArray) + a = vortex.array(pa.array([True] * 10_000)) + rarr = vortex.encoding.RoaringBoolArray.encode(a) + assert isinstance(rarr, vortex.encoding.RoaringBoolArray) assert rarr.nbytes < a.nbytes @pytest.mark.xfail(reason="Not yet implemented") def test_arange_encode(): - a = vortex.encode(pa.array(np.arange(10_000), type=pa.uint32())) + a = vortex.array(pa.array(np.arange(10_000), type=pa.uint32())) compressed = vortex.compress(a) - assert isinstance(compressed, vortex.DeltaArray) or isinstance(compressed, vortex.RoaringIntArray) + assert isinstance(compressed, vortex.encoding.DeltaArray) or isinstance(compressed, vortex.encoding.RoaringIntArray) assert compressed.nbytes < a.nbytes @pytest.mark.xfail(reason="Not yet implemented") def test_zigzag_encode(): - a = vortex.encode(pa.array([-1, -1, 0, -1, 1, -1])) - zarr = vortex.ZigZagArray.encode(a) - assert isinstance(zarr, vortex.ZigZagArray) + a = vortex.array(pa.array([-1, -1, 0, -1, 1, -1])) + zarr = vortex.encoding.ZigZagArray.encode(a) + assert isinstance(zarr, vortex.encoding.ZigZagArray) # TODO(ngates): support decoding once we have decompressor. def test_chunked_encode(): chunked = pa.chunked_array([pa.array([0, 1, 2]), pa.array([3, 4, 5])]) - encoded = vortex.encode(chunked) + encoded = vortex.array(chunked) assert encoded.to_arrow().combine_chunks() == pa.array([0, 1, 2, 3, 4, 5]) @@ -69,7 +69,7 @@ def test_table_encode(): "string": pa.chunked_array([pa.array(["a", "b", "c"]), pa.array(["d", "e", "f"])]), } ) - encoded = vortex.encode(table) + encoded = vortex.array(table) assert encoded.to_arrow().combine_chunks() == pa.StructArray.from_arrays( [pa.array([0, 1, 2, 3, 4, 5]), pa.array(["a", "b", "c", "d", "e", "f"])], names=["number", "string"] ) @@ -79,6 +79,6 @@ def test_table_encode(): def test_taxi(): curdir = Path(os.path.dirname(__file__)).parent.parent table = pq.read_table(curdir / "bench-vortex/data/yellow-tripdata-2023-11.parquet") - compressed = vortex.compress(vortex.encode(table[:100])) + compressed = vortex.compress(vortex.array(table[:100])) decompressed = compressed.to_arrow() assert not decompressed diff --git a/pyvortex/test/test_dtype.py b/pyvortex/test/test_dtype.py index 207a8ebc53..2d0dccd1b5 100644 --- a/pyvortex/test/test_dtype.py +++ b/pyvortex/test/test_dtype.py @@ -2,9 +2,9 @@ def test_int(): - assert str(vortex.int()) == "i64" - assert str(vortex.int(32)) == "i32" - assert str(vortex.int(32, nullable=True)) == "i32?" - assert str(vortex.uint(32)) == "u32" - assert str(vortex.float(16)) == "f16" - assert str(vortex.bool(nullable=True)) == "bool?" + assert str(vortex.dtype.int()) == "i64" + assert str(vortex.dtype.int(32)) == "i32" + assert str(vortex.dtype.int(32, nullable=True)) == "i32?" + assert str(vortex.dtype.uint(32)) == "u32" + assert str(vortex.dtype.float(16)) == "f16" + assert str(vortex.dtype.bool(nullable=True)) == "bool?" diff --git a/requirements-dev.lock b/requirements-dev.lock index ece7184320..686f367bc6 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -11,8 +11,18 @@ -e file:. -e file:pyvortex +accessible-pygments==0.0.5 + # via pydata-sphinx-theme +alabaster==1.0.0 + # via sphinx +asttokens==2.4.1 + # via stack-data babel==2.14.0 # via mkdocs-material + # via pydata-sphinx-theme + # via sphinx +beautifulsoup4==4.12.3 + # via pydata-sphinx-theme bracex==2.4 # via wcmatch certifi==2024.2.2 @@ -23,20 +33,33 @@ click==8.1.7 # via mkdocs colorama==0.4.6 # via mkdocs-material +decorator==5.1.1 + # via ipython +docutils==0.21.2 + # via pydata-sphinx-theme + # via sphinx +executing==2.0.1 + # via stack-data ghp-import==2.1.0 # via mkdocs idna==3.6 # via requests +imagesize==1.4.1 + # via sphinx importlib-metadata==7.1.0 # via mike importlib-resources==6.4.0 # via mike iniconfig==2.0.0 # via pytest +ipython==8.26.0 +jedi==0.19.1 + # via ipython jinja2==3.1.3 # via mike # via mkdocs # via mkdocs-material + # via sphinx markdown==3.6 # via mkdocs # via mkdocs-material @@ -44,6 +67,8 @@ markdown==3.6 markupsafe==2.1.5 # via jinja2 # via mkdocs +matplotlib-inline==0.1.7 + # via ipython maturin==1.5.1 mergedeep==1.3.4 # via mkdocs @@ -57,24 +82,44 @@ mkdocs-material==9.5.17 mkdocs-material-extensions==1.3.1 # via mkdocs-material numpy==1.26.4 + # via pandas # via pyarrow packaging==24.0 # via mkdocs + # via pydata-sphinx-theme # via pytest + # via sphinx paginate==0.5.6 # via mkdocs-material +pandas==2.2.2 +parso==0.8.4 + # via jedi pathspec==0.12.1 # via mkdocs +pexpect==4.9.0 + # via ipython pip==24.0 platformdirs==4.2.0 # via mkdocs pluggy==1.4.0 # via pytest +prompt-toolkit==3.0.47 + # via ipython +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data py-cpuinfo==9.0.0 # via pytest-benchmark pyarrow==15.0.2 +pydata-sphinx-theme==0.15.4 + # via vortex pygments==2.17.2 + # via accessible-pygments + # via ipython # via mkdocs-material + # via pydata-sphinx-theme + # via sphinx pymdown-extensions==10.7.1 # via mkdocs-material pyparsing==3.1.2 @@ -84,6 +129,9 @@ pytest==8.1.1 pytest-benchmark==4.0.0 python-dateutil==2.9.0.post0 # via ghp-import + # via pandas +pytz==2024.1 + # via pandas pyyaml==6.0.1 # via mike # via mkdocs @@ -95,9 +143,39 @@ regex==2023.12.25 # via mkdocs-material requests==2.31.0 # via mkdocs-material + # via sphinx ruff==0.3.5 six==1.16.0 + # via asttokens # via python-dateutil +snowballstemmer==2.2.0 + # via sphinx +soupsieve==2.6 + # via beautifulsoup4 +sphinx==8.0.2 + # via pydata-sphinx-theme +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +stack-data==0.6.3 + # via ipython +traitlets==5.14.3 + # via ipython + # via matplotlib-inline +typing-extensions==4.12.2 + # via ipython + # via pydata-sphinx-theme +tzdata==2024.1 + # via pandas urllib3==2.2.1 # via requests verspec==0.1.0 @@ -106,5 +184,7 @@ watchdog==4.0.0 # via mkdocs wcmatch==8.5.1 # via mkdocs-include-markdown-plugin +wcwidth==0.2.13 + # via prompt-toolkit zipp==3.18.1 # via importlib-metadata diff --git a/requirements.lock b/requirements.lock index 64a44c3203..12b45467d6 100644 --- a/requirements.lock +++ b/requirements.lock @@ -11,3 +11,60 @@ -e file:. -e file:pyvortex +accessible-pygments==0.0.5 + # via pydata-sphinx-theme +alabaster==1.0.0 + # via sphinx +babel==2.16.0 + # via pydata-sphinx-theme + # via sphinx +beautifulsoup4==4.12.3 + # via pydata-sphinx-theme +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +docutils==0.21.2 + # via pydata-sphinx-theme + # via sphinx +idna==3.8 + # via requests +imagesize==1.4.1 + # via sphinx +jinja2==3.1.4 + # via sphinx +markupsafe==2.1.5 + # via jinja2 +packaging==24.1 + # via pydata-sphinx-theme + # via sphinx +pydata-sphinx-theme==0.15.4 + # via vortex +pygments==2.18.0 + # via accessible-pygments + # via pydata-sphinx-theme + # via sphinx +requests==2.32.3 + # via sphinx +snowballstemmer==2.2.0 + # via sphinx +soupsieve==2.6 + # via beautifulsoup4 +sphinx==8.0.2 + # via pydata-sphinx-theme +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +typing-extensions==4.12.2 + # via pydata-sphinx-theme +urllib3==2.2.2 + # via requests diff --git a/vortex-array/src/array/struct_/mod.rs b/vortex-array/src/array/struct_/mod.rs index d2e6e05d9c..e6d0486301 100644 --- a/vortex-array/src/array/struct_/mod.rs +++ b/vortex-array/src/array/struct_/mod.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; use vortex_dtype::field::Field; -use vortex_dtype::{DType, FieldName, FieldNames, Nullability, StructDType}; +use vortex_dtype::{DType, FieldName, FieldNames, StructDType}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use crate::stats::{ArrayStatisticsCompute, StatsSet}; @@ -41,6 +41,8 @@ impl StructArray { length: usize, validity: Validity, ) -> VortexResult { + let nullability = validity.nullability(); + if names.len() != fields.len() { vortex_bail!("Got {} names and {} fields", names.len(), fields.len()); } @@ -60,10 +62,7 @@ impl StructArray { } Self::try_from_parts( - DType::Struct( - StructDType::new(names, field_dtypes), - Nullability::NonNullable, - ), + DType::Struct(StructDType::new(names, field_dtypes), nullability), length, StructMetadata { length, diff --git a/vortex-array/src/arrow/dtype.rs b/vortex-array/src/arrow/dtype.rs index 6398e6e37e..b71554bb2b 100644 --- a/vortex-array/src/arrow/dtype.rs +++ b/vortex-array/src/arrow/dtype.rs @@ -46,7 +46,7 @@ impl FromArrowType for DType { .map(|f| Self::from_arrow(f.as_ref())) .collect_vec(), ), - Nullability::NonNullable, + Nullability::NonNullable, // Must match From for Array ) } } diff --git a/vortex-array/src/arrow/recordbatch.rs b/vortex-array/src/arrow/recordbatch.rs index 020988890e..68ce12b5f5 100644 --- a/vortex-array/src/arrow/recordbatch.rs +++ b/vortex-array/src/arrow/recordbatch.rs @@ -24,7 +24,7 @@ impl From for Array { .map(|(array, field)| Array::from_arrow(array.clone(), field.is_nullable())) .collect(), value.num_rows(), - Validity::AllValid, + Validity::NonNullable, // Must match FromArrowType for DType ) .unwrap() .into()