From 8377a7653029741e534791691cae5ad958dff5c7 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 16 Sep 2024 11:34:10 +0100 Subject: [PATCH] Introduce a new `vortex-schema` crate (#819) The `Schema` struct needs to be accessed from multiple crates to enable more advanced usage (initially - filter re-ordering). --- Cargo.lock | 9 +++++++++ Cargo.toml | 2 ++ pyvortex/Cargo.toml | 4 ++-- pyvortex/src/io.rs | 2 +- vortex-sampling-compressor/Cargo.toml | 2 +- vortex-schema/Cargo.toml | 18 ++++++++++++++++++ .../read/schema.rs => vortex-schema/src/lib.rs | 14 +++++++++++--- .../src/projection.rs | 0 vortex-serde/Cargo.toml | 1 + vortex-serde/src/layouts/read/builder.rs | 2 +- vortex-serde/src/layouts/read/layouts.rs | 2 +- vortex-serde/src/layouts/read/mod.rs | 6 ++---- vortex-serde/src/layouts/read/stream.rs | 4 ++-- 13 files changed, 51 insertions(+), 15 deletions(-) create mode 100644 vortex-schema/Cargo.toml rename vortex-serde/src/layouts/read/schema.rs => vortex-schema/src/lib.rs (75%) rename vortex-serde/src/layouts/read/projections.rs => vortex-schema/src/projection.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 3a6a50e75a..79ed10a6a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4633,6 +4633,14 @@ dependencies = [ "vortex-proto", ] +[[package]] +name = "vortex-schema" +version = "0.8.0" +dependencies = [ + "vortex-dtype", + "vortex-error", +] + [[package]] name = "vortex-serde" version = "0.8.0" @@ -4666,6 +4674,7 @@ dependencies = [ "vortex-flatbuffers", "vortex-sampling-compressor", "vortex-scalar", + "vortex-schema", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 879b1116bf..2539318e23 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ members = [ "vortex-proto", "vortex-sampling-compressor", "vortex-scalar", + "vortex-schema", "vortex-serde", "xtask", ] @@ -147,6 +148,7 @@ vortex-roaring = { version = "0.8.0", path = "./encodings/roaring" } vortex-runend = { version = "0.8.0", path = "./encodings/runend" } vortex-runend-bool = { 
version = "0.8.0", path = "./encodings/runend-bool" } vortex-scalar = { version = "0.8.0", path = "./vortex-scalar", default-features = false } +vortex-schema = { version = "0.8.0", path = "./vortex-schema" } vortex-serde = { version = "0.8.0", path = "./vortex-serde", default-features = false } vortex-sampling-compressor = { version = "0.8.0", path = "./vortex-sampling-compressor" } vortex-zigzag = { version = "0.8.0", path = "./encodings/zigzag" } diff --git a/pyvortex/Cargo.toml b/pyvortex/Cargo.toml index d7a1dade76..da2e73d8a6 100644 --- a/pyvortex/Cargo.toml +++ b/pyvortex/Cargo.toml @@ -24,6 +24,7 @@ doctest = false arrow = { workspace = true, features = ["pyarrow"] } flexbuffers = { workspace = true } futures = { workspace = true } +itertools = { workspace = true } log = { workspace = true } paste = { workspace = true } pyo3 = { workspace = true } @@ -48,11 +49,10 @@ vortex-proto = { workspace = true } vortex-roaring = { workspace = true } vortex-runend = { workspace = true } vortex-runend-bool = { workspace = true } +vortex-sampling-compressor = { workspace = true } vortex-scalar = { workspace = true, default-features = false } vortex-serde = { workspace = true, default-features = false, features = ["tokio"] } -vortex-sampling-compressor = { workspace = true } vortex-zigzag = { workspace = true } -itertools = { workspace = true } # We may need this workaround? 
# https://pyo3.rs/v0.20.2/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror diff --git a/pyvortex/src/io.rs b/pyvortex/src/io.rs index 95eee00574..44ebdbe84b 100644 --- a/pyvortex/src/io.rs +++ b/pyvortex/src/io.rs @@ -152,7 +152,7 @@ pub fn read<'py>( let stream = builder.build().await?; - let dtype = stream.schema().into_dtype(); + let dtype = stream.schema().into(); let vecs: Vec<Array> = stream.try_collect().await?; diff --git a/vortex-sampling-compressor/Cargo.toml b/vortex-sampling-compressor/Cargo.toml index 56d2b92ddd..fc55c33c70 100644 --- a/vortex-sampling-compressor/Cargo.toml +++ b/vortex-sampling-compressor/Cargo.toml @@ -12,8 +12,8 @@ edition = { workspace = true } rust-version = { workspace = true } [dependencies] -fsst-rs = { workspace = true } arbitrary = { workspace = true, optional = true } +fsst-rs = { workspace = true } lazy_static = { workspace = true } log = { workspace = true } rand = { workspace = true } diff --git a/vortex-schema/Cargo.toml b/vortex-schema/Cargo.toml new file mode 100644 index 0000000000..b213ba0cfa --- /dev/null +++ b/vortex-schema/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "vortex-schema" +version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[dependencies] +vortex-dtype = { workspace = true } +vortex-error = { workspace = true } + +[lints] +workspace = true diff --git a/vortex-serde/src/layouts/read/schema.rs b/vortex-schema/src/lib.rs similarity index 75% rename from vortex-serde/src/layouts/read/schema.rs rename to vortex-schema/src/lib.rs index 3e30e2a457..98266e8523 100644 --- a/vortex-serde/src/layouts/read/schema.rs +++ b/vortex-schema/src/lib.rs @@ -1,12 +1,18 @@ use
vortex_dtype::DType; use vortex_error::{vortex_bail, VortexResult}; -use super::projections::Projection; +use self::projection::Projection; + +pub mod projection; #[derive(Clone, Debug)] pub struct Schema(pub(crate) DType); impl Schema { + pub fn new(schema_dtype: DType) -> Self { + Self(schema_dtype) + } + pub fn project(&self, projection: Projection) -> VortexResult<Self> { match projection { Projection::All => Ok(self.clone()), @@ -23,8 +29,10 @@ impl Schema { pub fn dtype(&self) -> &DType { &self.0 } +} - pub fn into_dtype(self) -> DType { - self.0 +impl From<Schema> for DType { + fn from(value: Schema) -> Self { + value.0 } } diff --git a/vortex-serde/src/layouts/read/projections.rs b/vortex-schema/src/projection.rs similarity index 100% rename from vortex-serde/src/layouts/read/projections.rs rename to vortex-schema/src/projection.rs diff --git a/vortex-serde/Cargo.toml b/vortex-serde/Cargo.toml index b8429d1f6e..b6a7d69ba6 100644 --- a/vortex-serde/Cargo.toml +++ b/vortex-serde/Cargo.toml @@ -29,6 +29,7 @@ vortex-error = { workspace = true, features = ["object_store"] } vortex-expr = { workspace = true } vortex-flatbuffers = { workspace = true, features = ["file"] } vortex-scalar = { workspace = true, features = ["flatbuffers"] } +vortex-schema = { workspace = true } [dev-dependencies] arrow = { workspace = true } diff --git a/vortex-serde/src/layouts/read/builder.rs b/vortex-serde/src/layouts/read/builder.rs index ad85c84b8c..ccbfd56f02 100644 --- a/vortex-serde/src/layouts/read/builder.rs +++ b/vortex-serde/src/layouts/read/builder.rs @@ -5,13 +5,13 @@ use bytes::BytesMut; use vortex::{Array, ArrayDType}; use vortex_dtype::field::Field; use vortex_error::{vortex_bail, VortexResult}; +use vortex_schema::projection::Projection; use crate::io::VortexReadAt; use crate::layouts::read::cache::{LayoutMessageCache, RelativeLayoutCache}; use crate::layouts::read::context::LayoutDeserializer; use crate::layouts::read::filtering::RowFilter; use
crate::layouts::read::footer::Footer; -use crate::layouts::read::projections::Projection; use crate::layouts::read::stream::LayoutBatchStream; use crate::layouts::read::{Scan, DEFAULT_BATCH_SIZE, FILE_POSTSCRIPT_SIZE, INITIAL_READ_SIZE}; use crate::layouts::MAGIC_BYTES; diff --git a/vortex-serde/src/layouts/read/layouts.rs b/vortex-serde/src/layouts/read/layouts.rs index 0e3d556c68..23087a0b41 100644 --- a/vortex-serde/src/layouts/read/layouts.rs +++ b/vortex-serde/src/layouts/read/layouts.rs @@ -8,8 +8,8 @@ use vortex_dtype::field::Field; use vortex_dtype::DType; use vortex_error::{vortex_bail, vortex_err, VortexExpect as _, VortexResult}; use vortex_flatbuffers::footer as fb; +use vortex_schema::projection::Projection; -use super::projections::Projection; use crate::layouts::read::batch::BatchReader; use crate::layouts::read::buffered::BufferedReader; use crate::layouts::read::cache::RelativeLayoutCache; diff --git a/vortex-serde/src/layouts/read/mod.rs b/vortex-serde/src/layouts/read/mod.rs index 817620ad82..c62a29db48 100644 --- a/vortex-serde/src/layouts/read/mod.rs +++ b/vortex-serde/src/layouts/read/mod.rs @@ -12,16 +12,14 @@ mod context; mod filtering; mod footer; mod layouts; -mod projections; -mod schema; mod stream; pub use builder::LayoutReaderBuilder; pub use context::*; pub use filtering::RowFilter; -pub use projections::Projection; -pub use schema::Schema; pub use stream::LayoutBatchStream; +pub use vortex_schema::projection::Projection; +pub use vortex_schema::Schema; use crate::stream_writer::ByteRange; diff --git a/vortex-serde/src/layouts/read/stream.rs b/vortex-serde/src/layouts/read/stream.rs index 9b15395470..dd09bec253 100644 --- a/vortex-serde/src/layouts/read/stream.rs +++ b/vortex-serde/src/layouts/read/stream.rs @@ -15,10 +15,10 @@ use vortex::{Array, IntoArray, IntoArrayVariant}; use vortex_dtype::{match_each_integer_ptype, DType}; use vortex_error::{vortex_err, vortex_panic, VortexError, VortexResult}; use vortex_scalar::Scalar; +use 
vortex_schema::Schema; use crate::io::VortexReadAt; use crate::layouts::read::cache::LayoutMessageCache; -use crate::layouts::read::schema::Schema; use crate::layouts::read::{Layout, MessageId, ReadResult, Scan}; use crate::layouts::Projection; use crate::stream_writer::ByteRange; @@ -56,7 +56,7 @@ impl LayoutBatchStream { } pub fn schema(&self) -> Schema { - Schema(self.dtype.clone()) + Schema::new(self.dtype.clone()) } // TODO(robert): Push this logic down to layouts