From 9317fe5eeffd618df695a5cbdc2756648a8fca58 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 15 Nov 2024 08:01:33 -0500 Subject: [PATCH] fix: Support selecting columns from interchange-level narwhals DataFrames (#544) * Use select in narwhals to support duckdb/ibis * drop patch * add clear_cache runtime method * fix --- pixi.lock | 84 +-- pixi.toml | 2 +- vegafusion-core/src/lib.rs | 1 - vegafusion-core/src/patch.rs | 589 ------------------- vegafusion-python/src/lib.rs | 20 - vegafusion-python/tests/test_pretransform.py | 28 + vegafusion-python/vegafusion/runtime.py | 66 +-- 7 files changed, 98 insertions(+), 692 deletions(-) delete mode 100644 vegafusion-core/src/patch.rs diff --git a/pixi.lock b/pixi.lock index b7ddc943..0681d929 100644 --- a/pixi.lock +++ b/pixi.lock @@ -247,7 +247,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.10.15-h4a871b0_2_cpython.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-duckdb-1.0.0-py310hea249c9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-duckdb-1.1.3-py310hf71b8c6_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-fastjsonschema-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-json-logger-2.0.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.2-pyhd8ed1ab_0.conda @@ -552,7 +552,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.10.15-hd8744da_2_cpython.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/python-duckdb-1.0.0-py310he0a0c5d_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/python-duckdb-1.1.3-py310h6954a95_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-fastjsonschema-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-json-logger-2.0.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.2-pyhd8ed1ab_0.conda @@ -870,7 +870,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.10.15-hdce6c4c_2_cpython.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-duckdb-1.0.0-py310hcf9f62a_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-duckdb-1.1.3-py310h853098b_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-fastjsonschema-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-json-logger-2.0.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.2-pyhd8ed1ab_0.conda @@ -1143,7 +1143,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.10.15-hfaddaf0_2_cpython.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/python-duckdb-1.0.0-py310h9e98ed7_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/python-duckdb-1.1.3-py310h9e98ed7_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-fastjsonschema-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-json-logger-2.0.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.2-pyhd8ed1ab_0.conda @@ -11922,35 +11922,34 @@ packages: timestamp: 1709299922152 - kind: conda name: python-duckdb - version: 1.0.0 - build: py310h9e98ed7_0 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/python-duckdb-1.0.0-py310h9e98ed7_0.conda - sha256: 23c2abb0018fdd2ee8176b33ac8eac48b6094a219b971c5fdc702285785aa4cd - md5: cae7ec224c706014f6e1568b3cf1cc96 + version: 1.1.3 + build: py310h6954a95_0 + subdir: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/python-duckdb-1.1.3-py310h6954a95_0.conda + sha256: 82fcd7656df4175101ee36fe3370bbf7cdac33b3d6375fa9178267edc03dafc3 + md5: e3574a2a0b1e85c0fa25eac37555c884 depends: + - __osx >=10.13 + - libcxx >=18 - python >=3.10,<3.11.0a0 - python_abi 3.10.* *_cp310 - - ucrt >=10.0.20348.0 - - vc >=14.2,<15 - - vc14_runtime >=14.29.30139 license: MIT license_family: MIT purls: - pkg:pypi/duckdb?source=hash-mapping - size: 15638825 - timestamp: 1717687118745 + size: 22035457 + timestamp: 1730799829176 - kind: conda name: python-duckdb - version: 1.0.0 - build: py310hcf9f62a_0 + version: 1.1.3 + build: py310h853098b_0 subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/python-duckdb-1.0.0-py310hcf9f62a_0.conda - sha256: 720fdd1e1a34bafc4e5b671c4ab722d2953d09563ca2a4520bb6fb450510fa34 - md5: ff23b03d25d3614a05e91d94036b94b8 + url: https://conda.anaconda.org/conda-forge/osx-arm64/python-duckdb-1.1.3-py310h853098b_0.conda + sha256: e0317a7dbfb44979d4ecbfc45454f60dc4a23e75c674fa821f216b500f8bd246 + md5: 13252069330a0b39712f8cde0cc7a22f depends: - __osx >=11.0 - - libcxx >=16 + - libcxx >=18 - python >=3.10,<3.11.0a0 - python >=3.10,<3.11.0a0 *_cpython - python_abi 3.10.* *_cp310 @@ -11958,47 +11957,48 @@ packages: license_family: MIT purls: - pkg:pypi/duckdb?source=hash-mapping - size: 18599847 - timestamp: 1717686407221 + size: 20398771 + timestamp: 1730799955032 - kind: conda name: python-duckdb - version: 1.0.0 - build: py310he0a0c5d_0 - subdir: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/python-duckdb-1.0.0-py310he0a0c5d_0.conda - sha256: 3dd1abaa03cb511588c848b74ffdd817f576f259f5d42ad76c77358277c8ae5a - md5: 2c7fa91f1a5f57a72b1aec7e25f0a169 + version: 1.1.3 + build: py310h9e98ed7_0 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/python-duckdb-1.1.3-py310h9e98ed7_0.conda + sha256: b4fe5816af8a982a7d59caa3ccbacae5f84c1eab084ce45242b86ce907338738 + md5: e03cc4a182c117ec9330e1545792427c depends: - - __osx >=10.13 - - libcxx >=16 - python >=3.10,<3.11.0a0 - python_abi 3.10.* *_cp310 + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 license: MIT license_family: MIT purls: - pkg:pypi/duckdb?source=hash-mapping - size: 20190347 - timestamp: 1717686142652 + size: 16945750 + timestamp: 1730800695228 - kind: conda name: python-duckdb - version: 1.0.0 - build: py310hea249c9_0 + version: 1.1.3 + build: py310hf71b8c6_0 subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/python-duckdb-1.0.0-py310hea249c9_0.conda - sha256: c85731fcd95eba6459f74c675dc6ea6a4ec31ab09607d4bb4316c701690cec20 - md5: 630bef971bd14f61afa83422425d7f95 + url: https://conda.anaconda.org/conda-forge/linux-64/python-duckdb-1.1.3-py310hf71b8c6_0.conda + sha256: 0af8fabe998df691b317d7ac0455b30d6d64ce240e5bcedeb16b9ae8cf3a1533 + md5: 1c07ae5896495839be2503ef31628c5f depends: - __glibc >=2.17,<3.0.a0 - - libgcc-ng >=12 - - libstdcxx-ng >=12 + - libgcc >=13 + - libstdcxx >=13 - python >=3.10,<3.11.0a0 - python_abi 3.10.* *_cp310 license: MIT license_family: MIT purls: - pkg:pypi/duckdb?source=hash-mapping - size: 22769349 - timestamp: 1717686625369 + size: 24267398 + timestamp: 1730800229394 - kind: conda name: python-fastjsonschema version: 2.20.0 diff --git a/pixi.toml b/pixi.toml index 711698f3..0e69a9f6 100644 --- a/pixi.toml +++ b/pixi.toml @@ -130,7 +130,7 @@ selenium = "4.11.2.*" toml = "0.10.2.*" pytest = ">=4.6" click = "8.1.6.*" -python-duckdb = "1.0" +python-duckdb = ">=1.1.3,<2" pip = "23.2.1.*" voila = "0.5.0.*" tenacity = "8.2.3.*" diff --git a/vegafusion-core/src/lib.rs b/vegafusion-core/src/lib.rs index 0597a424..639bb50b 100644 --- a/vegafusion-core/src/lib.rs +++ b/vegafusion-core/src/lib.rs @@ -5,7 +5,6 @@ extern crate core; pub mod chart_state; pub mod data; pub mod expression; -pub mod patch; pub mod planning; pub mod proto; pub mod runtime; diff --git a/vegafusion-core/src/patch.rs b/vegafusion-core/src/patch.rs deleted file mode 100644 index 59875505..00000000 --- a/vegafusion-core/src/patch.rs +++ /dev/null @@ -1,589 +0,0 @@ -use crate::error::Result; -use crate::planning::plan::{PlannerConfig, SpecPlan}; -use crate::spec::chart::{ChartSpec, ChartVisitor}; -use crate::spec::data::DataSpec; -use crate::spec::values::StringOrSignalSpec; -use itertools::Itertools; -use json_patch::{diff, patch, PatchOperation}; -use serde_json::Value; - -/// Attempt to apply the difference between two vega specs to a third pre-transformed spec -pub fn patch_pre_transformed_spec( - spec1: &ChartSpec, - pre_transformed_spec1: &ChartSpec, - spec2: &ChartSpec, -) -> Result> { - // Run spec1 and spec2 through the client portion of the pre_transform_spec logic. - // This performs domain splitting and introduces projection pushdown transforms. - let planner_config = PlannerConfig { - extract_server_data: false, - ..Default::default() - }; - let plan1 = SpecPlan::try_new(spec1, &planner_config)?; - let planned_spec1 = plan1.client_spec; - - let plan2 = SpecPlan::try_new(spec2, &planner_config)?; - let planned_spec2 = plan2.client_spec; - - // Diff the planned specs to create patch between them - let spec_patch = diff( - &serde_json::to_value(planned_spec1)?, - &serde_json::to_value(planned_spec2)?, - ); - - // Do not apply patch if there are changes to any datasets - for patch_op in &spec_patch.0 { - let path = match patch_op { - PatchOperation::Add(op) => &op.path, - PatchOperation::Remove(op) => &op.path, - PatchOperation::Replace(op) => &op.path, - PatchOperation::Move(op) => &op.path, - PatchOperation::Copy(op) => &op.path, - PatchOperation::Test(op) => &op.path, - }; - if path.contains("/data/") { - return Ok(None); - } - } - - // Attempt to apply patch to pre-transformed spec - let mut pre_transformed_spec2 = serde_json::to_value(pre_transformed_spec1)?; - if patch(&mut pre_transformed_spec2, spec_patch.0.as_slice()).is_err() { - // Patch failed to apply, return None - Ok(None) - } else { - // Convert objects with index keys (like {"0": "foo", "1": "bar"}) - // to arrays (like ["foo", "bar"]) - let pre_transformed_spec2 = arrayify_int_key_objects(&pre_transformed_spec2); - - // Patch applied successfully, check validity - let pre_transformed_spec2: ChartSpec = serde_json::from_value(pre_transformed_spec2)?; - - // Check for presence of inline dataset URLs, this indicates an invalid pre transformed spec - let mut visitor = AnyInlineDatasetUrlsVisitor::new(); - pre_transformed_spec2.walk(&mut visitor)?; - if visitor.any_inline_dataset_urls { - return Ok(None); - } - - Ok(Some(pre_transformed_spec2)) - } -} - -/// When replacing an object with an array, the patched chart will sometimes end up with -/// objects of the form {"0": "A", "1": "BB"}. This function identifies such objects and converts -/// them into arrays (["A", "BB"]). -fn arrayify_int_key_objects(obj: &Value) -> Value { - match obj { - Value::Object(map) => { - if !map.is_empty() && map.keys().all(|k| k.parse::().is_ok()) { - // Object is not empty and all keys unsigned integer strings - let indices: Vec<_> = map - .keys() - .map(|k| k.parse::().unwrap()) - .sorted() - .collect(); - let max_index = indices[indices.len() - 1]; - let mut new_array = vec![Value::Null; max_index + 1]; - for idx in indices { - new_array[idx] = arrayify_int_key_objects(&map[&idx.to_string()]); - } - Value::Array(new_array) - } else { - // Recurse into the sub-object otherwise - let mut new_map = serde_json::Map::new(); - for (k, v) in map { - new_map.insert(k.clone(), arrayify_int_key_objects(v)); - } - Value::Object(new_map) - } - } - Value::Array(arr) => Value::Array(arr.iter().map(arrayify_int_key_objects).collect()), - _ => obj.clone(), - } -} - -struct AnyInlineDatasetUrlsVisitor { - pub any_inline_dataset_urls: bool, -} - -impl AnyInlineDatasetUrlsVisitor { - pub fn new() -> Self { - Self { - any_inline_dataset_urls: false, - } - } -} - -impl ChartVisitor for AnyInlineDatasetUrlsVisitor { - fn visit_data(&mut self, data: &DataSpec, _scope: &[u32]) -> Result<()> { - if let Some(StringOrSignalSpec::String(url)) = &data.url { - if url.starts_with("table://") || url.starts_with("vegafusion+dataset://") { - self.any_inline_dataset_urls = true; - } - } - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use crate::patch::patch_pre_transformed_spec; - use crate::spec::chart::ChartSpec; - use serde_json::{json, Value}; - - fn histogram(fill: Value, max_bins: u32, add_dataset: bool) -> ChartSpec { - let mut chart_spec: ChartSpec = serde_json::from_value(json!( - { - "$schema": "https://vega.github.io/schema/vega/v5.json", - "background": "white", - "padding": 5, - "width": 200, - "height": 200, - "style": "cell", - "data": [ - { - "name": "source_0", - "url": "data/movies.json", - "format": {"type": "json"}, - "transform": [ - { - "type": "extent", - "field": "IMDB Rating", - "signal": "bin_maxbins_10_IMDB_Rating_extent" - }, - { - "type": "bin", - "field": "IMDB Rating", - "as": [ - "bin_maxbins_10_IMDB Rating", - "bin_maxbins_10_IMDB Rating_end" - ], - "signal": "bin_maxbins_10_IMDB_Rating_bins", - "extent": {"signal": "bin_maxbins_10_IMDB_Rating_extent"}, - "maxbins": max_bins - }, - { - "type": "aggregate", - "groupby": [ - "bin_maxbins_10_IMDB Rating", - "bin_maxbins_10_IMDB Rating_end" - ], - "ops": ["count"], - "fields": [null], - "as": ["__count"] - }, - { - "type": "filter", - "expr": "isValid(datum[\"bin_maxbins_10_IMDB Rating\"]) && isFinite(+datum[\"bin_maxbins_10_IMDB Rating\"])" - } - ] - } - ], - "marks": [ - { - "name": "marks", - "type": "rect", - "style": ["bar"], - "from": {"data": "source_0"}, - "encode": { - "update": { - "fill": fill, - "ariaRoleDescription": {"value": "bar"}, - "x2": { - "scale": "x", - "field": "bin_maxbins_10_IMDB Rating", - "offset": 1 - }, - "x": {"scale": "x", "field": "bin_maxbins_10_IMDB Rating_end"}, - "y": {"scale": "y", "field": "__count"}, - "y2": {"scale": "y", "value": 0} - } - } - } - ], - "scales": [ - { - "name": "x", - "type": "linear", - "domain": { - "signal": "[bin_maxbins_10_IMDB_Rating_bins.start, bin_maxbins_10_IMDB_Rating_bins.stop]" - }, - "range": [0, {"signal": "width"}], - "bins": {"signal": "bin_maxbins_10_IMDB_Rating_bins"}, - "zero": false - }, - { - "name": "y", - "type": "linear", - "domain": {"data": "source_0", "field": "__count"}, - "range": [{"signal": "height"}, 0], - "nice": true, - "zero": true - } - ], - "axes": [ - { - "scale": "y", - "orient": "left", - "gridScale": "x", - "grid": true, - "tickCount": {"signal": "ceil(height/40)"}, - "domain": false, - "labels": false, - "aria": false, - "maxExtent": 0, - "minExtent": 0, - "ticks": false, - "zindex": 0 - }, - { - "scale": "x", - "orient": "bottom", - "grid": false, - "title": "IMDB Rating (binned)", - "labelFlush": true, - "labelOverlap": true, - "tickCount": {"signal": "ceil(width/10)"}, - "zindex": 0 - }, - { - "scale": "y", - "orient": "left", - "grid": false, - "title": "Count of Records", - "labelOverlap": true, - "tickCount": {"signal": "ceil(height/40)"}, - "zindex": 0 - } - ] - } - )).unwrap(); - - if add_dataset { - chart_spec.data.push( - serde_json::from_value(json!( - { - "name": "added_dataset", - "values": [{"a": 1, "b": 2}] - } - )) - .unwrap(), - ) - } - - chart_spec - } - - fn pre_transformed_histogram(fill: Value) -> ChartSpec { - serde_json::from_value(json!( - { - "$schema": "https://vega.github.io/schema/vega/v5.json", - "data": [ - { - "name": "source_0", - "values": [ - { - "__count": 985, - "bin_maxbins_10_IMDB Rating": 6.0, - "bin_maxbins_10_IMDB Rating_end": 7.0 - }, - { - "__count": 100, - "bin_maxbins_10_IMDB Rating": 3.0, - "bin_maxbins_10_IMDB Rating_end": 4.0 - }, - { - "__count": 741, - "bin_maxbins_10_IMDB Rating": 7.0, - "bin_maxbins_10_IMDB Rating_end": 8.0 - }, - { - "__count": 633, - "bin_maxbins_10_IMDB Rating": 5.0, - "bin_maxbins_10_IMDB Rating_end": 6.0 - }, - { - "__count": 204, - "bin_maxbins_10_IMDB Rating": 8.0, - "bin_maxbins_10_IMDB Rating_end": 9.0 - }, - { - "__count": 43, - "bin_maxbins_10_IMDB Rating": 2.0, - "bin_maxbins_10_IMDB Rating_end": 3.0 - }, - { - "__count": 273, - "bin_maxbins_10_IMDB Rating": 4.0, - "bin_maxbins_10_IMDB Rating_end": 5.0 - }, - { - "__count": 4, - "bin_maxbins_10_IMDB Rating": 9.0, - "bin_maxbins_10_IMDB Rating_end": 10.0 - }, - { - "__count": 5, - "bin_maxbins_10_IMDB Rating": 1.0, - "bin_maxbins_10_IMDB Rating_end": 2.0 - } - ] - }, - { - "name": "source_0_y_domain___count", - "values": [ - { - "min": 4, - "max": 985 - } - ] - } - ], - "signals": [ - { - "name": "bin_maxbins_10_IMDB_Rating_bins", - "value": { - "fields": [ - "IMDB Rating" - ], - "fname": "bin_IMDB Rating", - "start": 1.0, - "step": 1.0, - "stop": 10.0 - } - } - ], - "marks": [ - { - "type": "rect", - "name": "marks", - "from": { - "data": "source_0" - }, - "encode": { - "update": { - "x": { - "field": "bin_maxbins_10_IMDB Rating_end", - "scale": "x" - }, - "ariaRoleDescription": { - "value": "bar" - }, - "y": { - "field": "__count", - "scale": "y" - }, - "y2": { - "value": 0, - "scale": "y" - }, - "fill": fill, - "x2": { - "field": "bin_maxbins_10_IMDB Rating", - "scale": "x", - "offset": 1 - } - } - }, - "style": [ - "bar" - ] - } - ], - "scales": [ - { - "name": "x", - "type": "linear", - "domain": { - "signal": "[bin_maxbins_10_IMDB_Rating_bins.start, bin_maxbins_10_IMDB_Rating_bins.stop]" - }, - "range": [ - 0, - { - "signal": "width" - } - ], - "bins": { - "signal": "bin_maxbins_10_IMDB_Rating_bins" - }, - "zero": false - }, - { - "name": "y", - "type": "linear", - "domain": [ - { - "signal": "(data(\"source_0_y_domain___count\")[0] || {}).min" - }, - { - "signal": "(data(\"source_0_y_domain___count\")[0] || {}).max" - } - ], - "range": [ - { - "signal": "height" - }, - 0 - ], - "zero": true, - "nice": true - } - ], - "axes": [ - { - "scale": "y", - "grid": true, - "gridScale": "x", - "labels": false, - "minExtent": 0, - "tickCount": { - "signal": "ceil(height/40)" - }, - "maxExtent": 0, - "domain": false, - "zindex": 0, - "aria": false, - "ticks": false, - "orient": "left" - }, - { - "scale": "x", - "orient": "bottom", - "labelOverlap": true, - "grid": false, - "title": "IMDB Rating (binned)", - "labelFlush": true, - "tickCount": { - "signal": "ceil(width/10)" - }, - "zindex": 0 - }, - { - "scale": "y", - "zindex": 0, - "grid": false, - "labelOverlap": true, - "tickCount": { - "signal": "ceil(height/40)" - }, - "orient": "left", - "title": "Count of Records" - } - ], - "height": 200, - "style": "cell", - "background": "white", - "padding": 5, - "width": 200 - } - )).unwrap() - } - - #[test] - fn test_patch_color_succeeds() { - let spec1: ChartSpec = histogram(json!({"value": "blue"}), 10, false); - - let spec2: ChartSpec = histogram(json!({"value": "red"}), 10, false); - - let pre_transformed_spec1: ChartSpec = pre_transformed_histogram(json!({"value": "blue"})); - - let pre_transform_spec2 = - patch_pre_transformed_spec(&spec1, &pre_transformed_spec1, &spec2) - .expect("Expected patch_pre_transformed_spec to succeed") - .expect("Expected patch_pre_transformed_spec to return Some"); - - assert_eq!( - pre_transform_spec2, - pre_transformed_histogram(json!({"value": "red"})) - ) - } - - #[test] - fn test_patch_color_array_succeeds() { - // Replace an object with an array of objects - let fill_value = json!({"value": "blue"}); - let fill_array = json!([{"test": "2 < 3", "value": "red"}, {"value": "green"}]); - - let spec1: ChartSpec = histogram(fill_value.clone(), 10, false); - let spec2: ChartSpec = histogram(fill_array.clone(), 10, false); - let pre_transformed_spec1: ChartSpec = pre_transformed_histogram(fill_value); - - let pre_transform_spec2 = - patch_pre_transformed_spec(&spec1, &pre_transformed_spec1, &spec2) - .expect("Expected patch_pre_transformed_spec to succeed") - .expect("Expected patch_pre_transformed_spec to return Some"); - - let encode_spec = pre_transform_spec2.marks[0].encode.clone().unwrap(); - let update_spec = encode_spec.encodings.get("update").unwrap(); - let fill = update_spec.channels.get("fill").unwrap(); - let fill_str = serde_json::to_string(fill).unwrap(); - assert_eq!( - fill_str, - r#"[{"value":"red","test":"2 < 3"},{"value":"green"}]"# - ); - } - - #[test] - fn test_patch_max_bins_fails() { - let spec1: ChartSpec = histogram(json!({"value": "blue"}), 10, false); - - let spec2: ChartSpec = histogram(json!({"value": "blue"}), 20, false); - - let pre_transformed_spec1: ChartSpec = pre_transformed_histogram(json!({"value": "blue"})); - - let pre_transform_spec2 = - patch_pre_transformed_spec(&spec1, &pre_transformed_spec1, &spec2) - .expect("Expected patch_pre_transformed_spec to succeed"); - assert!(pre_transform_spec2.is_none()); - } - - #[test] - fn test_patch_adds_inline_dataset() { - let spec1: ChartSpec = serde_json::from_value(json!( - { - "data": [ - { - "name": "data1", - "url": "something.csv" - } - ] - } - )) - .unwrap(); - - let spec2: ChartSpec = serde_json::from_value(json!( - { - "data": [ - { - "name": "data1", - "url": "table://something" - } - ] - } - )) - .unwrap(); - - let pre_transformed_spec1 = spec1.clone(); - - let pre_transform_spec2 = - patch_pre_transformed_spec(&spec1, &pre_transformed_spec1, &spec2) - .expect("Expected patch_pre_transformed_spec to succeed"); - - assert!(pre_transform_spec2.is_none()); - } - - #[test] - fn test_patch_add_dataset_fails() { - let spec1: ChartSpec = histogram(json!({"value": "blue"}), 10, false); - - let spec2: ChartSpec = histogram(json!({"value": "red"}), 10, true); - - let pre_transformed_spec1: ChartSpec = pre_transformed_histogram(json!({"value": "blue"})); - - let pre_transform_spec2 = - patch_pre_transformed_spec(&spec1, &pre_transformed_spec1, &spec2) - .expect("Expected patch_pre_transformed_spec to succeed"); - assert!(pre_transform_spec2.is_none()); - } -} diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index 8a553409..540dde51 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -25,7 +25,6 @@ use pythonize::{depythonize, pythonize}; use serde_json::json; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_core::data::dataset::VegaFusionDataset; -use vegafusion_core::patch::patch_pre_transformed_spec; use vegafusion_core::planning::plan::{PlannerConfig, PreTransformSpecWarningSpec, SpecPlan}; use vegafusion_core::planning::projection_pushdown::get_column_usage as rs_get_column_usage; use vegafusion_core::planning::watch::{ExportUpdateJSON, WatchPlan}; @@ -504,25 +503,6 @@ impl PyVegaFusionRuntime { }) } - pub fn patch_pre_transformed_spec( - &self, - spec1: PyObject, - pre_transformed_spec1: PyObject, - spec2: PyObject, - ) -> PyResult> { - let spec1 = parse_json_spec(spec1)?; - let pre_transformed_spec1 = parse_json_spec(pre_transformed_spec1)?; - let spec2 = parse_json_spec(spec2)?; - Python::with_gil(|py| { - match patch_pre_transformed_spec(&spec1, &pre_transformed_spec1, &spec2)? { - None => Ok(None), - Some(pre_transformed_spec2) => Ok(Some( - pythonize::pythonize(py, &pre_transformed_spec2)?.into(), - )), - } - }) - } - pub fn clear_cache(&self) -> PyResult<()> { if let Some(runtime) = self.runtime.as_any().downcast_ref::() { self.tokio_runtime.block_on(runtime.clear_cache()); diff --git a/vegafusion-python/tests/test_pretransform.py b/vegafusion-python/tests/test_pretransform.py index df326685..e9c698ea 100644 --- a/vegafusion-python/tests/test_pretransform.py +++ b/vegafusion-python/tests/test_pretransform.py @@ -10,6 +10,7 @@ import pyarrow as pa import pytest from pandas import NaT, Timestamp +import duckdb import vegafusion as vf @@ -1634,6 +1635,33 @@ def test_pre_transform_dataset_dataframe_interface_protocol(): assert_frame_equal(result, expected) +def test_pre_transform_dataset_duckdb(): + n = 4050 + # Input a polars DataFrame (which follows the DataFrame Interface Protocol) + order_items = pl.DataFrame({"menu_item": [0] * n + [1] * (2 * n) + [2] * (3 * n)}) # noqa: F841 + with duckdb.connect(":memory:") as con: + con.register("order_items", order_items) + rel = con.sql("SELECT * FROM order_items") + + vega_spec = order_items_spec() + datasets, warnings = vf.runtime.pre_transform_datasets( + vega_spec, + ["data_0"], + inline_datasets={ + "order_items": rel, + }, + ) + assert len(warnings) == 0 + assert len(datasets) == 1 + + result = datasets[0] + + # Result should be a pandas DataFrame + assert isinstance(result, pd.DataFrame) + expected = pd.DataFrame({"menu_item": [0, 1, 2], "__count": [n, 2 * n, 3 * n]}) + pd.testing.assert_frame_equal(result, expected) + + def test_gh_268_hang(): """ Tests for hang reported in https://github.com/hex-inc/vegafusion/issues/268 diff --git a/vegafusion-python/vegafusion/runtime.py b/vegafusion-python/vegafusion/runtime.py index a010537e..63e4b50c 100644 --- a/vegafusion-python/vegafusion/runtime.py +++ b/vegafusion-python/vegafusion/runtime.py @@ -29,12 +29,16 @@ UnaryUnaryMultiCallable = Any -def _get_common_namespace(inline_datasets: dict[str, Any] | None) -> ModuleType | None: +def _get_common_full_namespace( + inline_datasets: dict[str, Any] | None, +) -> ModuleType | None: namespaces: set[ModuleType] = set() try: if inline_datasets is not None: for df in inline_datasets.values(): - namespaces.add(nw.get_native_namespace(nw.from_native(df))) + nw_df = nw.from_native(df) + if nw.get_level(nw_df) == "full": + namespaces.add(nw.get_native_namespace(nw_df)) if len(namespaces) == 1: return next(iter(namespaces)) @@ -295,8 +299,16 @@ def _import_inline_datasets( # Project down columns if possible if columns is not None: - # TODO: Nice error message when column is not found - df_nw = df_nw[columns] # type: ignore[index] + missing_col = [ + col for col in columns if col not in df_nw.columns + ] + if missing_col: + msg = ( + "Columns found in chart spec but not in DataFrame: " + f"{missing_col}" + ) + raise ValueError(msg) + df_nw = df_nw.select(columns) imported_inline_datasets[name] = Table(df_nw) # type: ignore[arg-type] except TypeError: @@ -557,7 +569,7 @@ def normalize_timezones( datasets = values else: raise ValueError(f"Unrecognized dataset_format: {dataset_format}") - elif (namespace := _get_common_namespace(inline_datasets)) is not None: + elif (namespace := _get_common_full_namespace(inline_datasets)) is not None: # Infer the type from the inline datasets datasets = normalize_timezones( [nw.from_arrow(value, native_namespace=namespace) for value in values] @@ -691,40 +703,6 @@ def pre_transform_extract( return new_spec, datasets, warnings - def patch_pre_transformed_spec( - self, - spec1: dict[str, Any] | str, - pre_transformed_spec1: dict[str, Any] | str, - spec2: dict[str, Any] | str, - ) -> dict[str, Any] | None: - """ - Attempt to patch a Vega spec returned by the pre_transform_spec method. - - This method tries to patch a Vega spec without rerunning the pre_transform_spec - logic. When possible, this can be significantly faster than rerunning the - pre_transform_spec method. - - Args: - spec1: The input Vega spec to a prior call to pre_transform_spec. - pre_transformed_spec1: The prior result of passing spec1 to - pre_transform_spec. - spec2: A Vega spec that is assumed to be a small delta compared to spec1. - - Returns: - If the delta between spec1 and spec2 is in the portions of spec1 that were - not modified by pre_transform_spec, then this delta can be applied cleanly - to pre_transform_spec1 and the result is returned. If the delta cannot be - applied cleanly, None is returned and spec2 should be passed through the - pre_transform_spec method. - """ - if self.using_grpc: - raise ValueError("patch_pre_transformed_spec not yet supported over gRPC") - else: - pre_transformed_spec2 = self.runtime.patch_pre_transformed_spec( - spec1, pre_transformed_spec1, spec2 - ) - return cast(dict[str, Any], pre_transformed_spec2) - @property def worker_threads(self) -> int | None: """ @@ -808,10 +786,20 @@ def cache_capacity(self, value: int) -> None: self.reset() def reset(self) -> None: + """ + Restart the runtime + """ if self._runtime is not None: self._runtime.clear_cache() self._runtime = None + def clear_cache(self) -> None: + """ + Clear the VegaFusion runtime cache + """ + if self._runtime is not None: + self._runtime.clear_cache() + def __repr__(self) -> str: if self.using_grpc: return f"VegaFusionRuntime(url={self._grpc_url})"