From befac37584101afba2c41037f11b6e6dfb2fe910 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 13 Sep 2024 20:26:54 -0400 Subject: [PATCH] Improve PhysicalExpr and Column documentation (#12457) * Improve PhysicalExpr and Column documentation * Apply suggestions from code review Co-authored-by: Chunchun Ye <14298407+appletreeisyellow@users.noreply.github.com> --------- Co-authored-by: Chunchun Ye <14298407+appletreeisyellow@users.noreply.github.com> --- .../physical-expr-common/src/physical_expr.rs | 23 +++++++++- .../physical-expr/src/expressions/column.rs | 45 ++++++++++++++++--- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 75d300dd0107..a443a65eaa8f 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -31,8 +31,27 @@ use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::interval_arithmetic::Interval; use datafusion_expr_common::sort_properties::ExprProperties; -/// See [create_physical_expr](https://docs.rs/datafusion/latest/datafusion/physical_expr/fn.create_physical_expr.html) -/// for examples of creating `PhysicalExpr` from `Expr` +/// [`PhysicalExpr`]s represent expressions such as `A + 1` or `CAST(c1 AS int)`. +/// +/// `PhysicalExpr` knows its type, nullability and can be evaluated directly on +/// a [`RecordBatch`] (see [`Self::evaluate`]). +/// +/// `PhysicalExpr` are the physical counterpart to [`Expr`] used in logical +/// planning. 
They are typically created from [`Expr`] by a [`PhysicalPlanner`] +/// invoked from a higher level API. +/// +/// Some important examples of `PhysicalExpr` are: +/// * [`Column`]: Represents a column at a given index in a RecordBatch +/// +/// To create `PhysicalExpr` from `Expr`, see +/// * [`SessionContext::create_physical_expr`]: A high level API +/// * [`create_physical_expr`]: A low level API +/// +/// [`SessionContext::create_physical_expr`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.create_physical_expr +/// [`PhysicalPlanner`]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html +/// [`Expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html +/// [`create_physical_expr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/fn.create_physical_expr.html +/// [`Column`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/expressions/struct.Column.html pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq { /// Returns the physical expression as [`Any`] so that it can be /// downcast to a specific implementation. diff --git a/datafusion/physical-expr/src/expressions/column.rs b/datafusion/physical-expr/src/expressions/column.rs index 79d15fdb02e8..bf15821bca7a 100644 --- a/datafusion/physical-expr/src/expressions/column.rs +++ b/datafusion/physical-expr/src/expressions/column.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Column expression +//! Physical column reference: [`Column`] use std::any::Any; use std::hash::{Hash, Hasher}; @@ -33,14 +33,48 @@ use datafusion_expr::ColumnarValue; use crate::physical_expr::{down_cast_any_ref, PhysicalExpr}; /// Represents the column at a given index in a RecordBatch +/// +/// This is a physical expression that represents a column at a given index in an +/// arrow [`Schema`] / [`RecordBatch`]. 
+/// +/// Unlike the [logical `Expr::Column`], this expression is always resolved by schema index, +/// even though it does have a name. This is because the physical plan is always +/// resolved to a specific schema and there is no concept of "relation". +/// +/// # Example: +/// If the schema is `a`, `b`, `c` the `Column` for `b` would be represented by +/// index 1, since `b` is the second column in the schema. +/// +/// ``` +/// # use datafusion_physical_expr::expressions::Column; +/// # use arrow::datatypes::{DataType, Field, Schema}; +/// // Schema with columns a, b, c +/// let schema = Schema::new(vec![ +/// Field::new("a", DataType::Int32, false), +/// Field::new("b", DataType::Int32, false), +/// Field::new("c", DataType::Int32, false), +/// ]); +/// +/// // reference to column b is index 1 +/// let column_b = Column::new_with_schema("b", &schema).unwrap(); +/// assert_eq!(column_b.index(), 1); +/// +/// // reference to column c is index 2 +/// let column_c = Column::new_with_schema("c", &schema).unwrap(); +/// assert_eq!(column_c.index(), 2); +/// ``` +/// [logical `Expr::Column`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html#variant.Column #[derive(Debug, Hash, PartialEq, Eq, Clone)] pub struct Column { + /// The name of the column (used for debugging and display purposes) name: String, + /// The index of the column in its schema index: usize, } impl Column { - /// Create a new column expression + /// Create a new column expression which references the + /// column with the given index in the schema. 
pub fn new(name: &str, index: usize) -> Self { Self { name: name.to_owned(), @@ -48,17 +82,18 @@ impl Column { } } - /// Create a new column expression based on column name and schema + /// Create a new column expression which references the + /// column with the given name in the schema pub fn new_with_schema(name: &str, schema: &Schema) -> Result { Ok(Column::new(name, schema.index_of(name)?)) } - /// Get the column name + /// Get the column's name pub fn name(&self) -> &str { &self.name } - /// Get the column index + /// Get the column's schema index pub fn index(&self) -> usize { self.index }