diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 14d4b41..6561a2d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -15,7 +15,7 @@ jobs: matrix: version: - '1' - - '1.3' + - '1.6' os: - ubuntu-latest arch: diff --git a/Project.toml b/Project.toml index fe30abd..6121013 100644 --- a/Project.toml +++ b/Project.toml @@ -1,21 +1,23 @@ name = "Legolas" uuid = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" authors = ["Beacon Biosignals, Inc."] -version = "0.4.0" +version = "0.5.0" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] Arrow = "2" DataFrames = "1" Tables = "1.4" -julia = "1.3" +julia = "1.6" [extras] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [targets] -test = ["Test", "DataFrames"] +test = ["Test", "DataFrames", "UUIDs"] diff --git a/docs/make.jl b/docs/make.jl index e81cb9f..da57a99 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -5,8 +5,8 @@ makedocs(modules=[Legolas], sitename="Legolas", authors="Beacon Biosignals, Inc.", pages=["API Documentation" => "index.md", - "Tips For Schema Authors" => "schema.md", - "Legolas Table Specification" => "specification.md", + "Schema-Related Concepts/Conventions" => "schema-concepts.md", + "Arrow-Related Concepts/Conventions" => "arrow-concepts.md", "FAQ" => "faq.md"]) deploydocs(repo="github.com/beacon-biosignals/Legolas.jl.git", diff --git a/docs/src/arrow-concepts.md b/docs/src/arrow-concepts.md new file mode 100644 index 0000000..01d280d --- /dev/null +++ b/docs/src/arrow-concepts.md @@ -0,0 +1,19 @@ +# Arrow-Related Concepts/Conventions + +!!! note + + If you're a newcomer to Legolas.jl, please familiarize yourself with the [tour](https://github.com/beacon-biosignals/Legolas.jl/blob/main/examples/tour.jl) before diving into this documentation. 
+ +Legolas.jl's target (de)serialization format, [Arrow](https://arrow.apache.org/), already features wide cross-language adoption, enabling Legolas-serialized tables to be seamlessly read into many non-Julia environments. This documentation section contains conventions related to Legolas-serialized Arrow tables that may be observable by generic Legolas-unaware Arrow consumers. + +## Supporting Legolas Schema Discovery In Arrow Tables + +Legolas defines a special field `legolas_schema_qualified` that Legolas-aware Arrow writers may include in an Arrow table's table-level metadata to indicate a particular Legolas schema with which the table complies. + +Arrow tables which include this field are considered to "support Legolas schema discovery" and are referred to as "Legolas-discoverable", since Legolas consumers may employ this field to automatically match the table against available application-layer Legolas schema definitions. + +If present, the `legolas_schema_qualified` field's value must be a [fully qualified schema version identifier](@ref schema_version_identifier_specification). + +## Arrow File Naming Conventions + +When writing a Legolas-discoverable Arrow table to a file, prefer using the file extension `*.<unqualified schema name>.arrow`. For example, if the file's table's full Legolas schema version identifier is `baz.supercar@1>bar.automobile@1`, use the file extension `*.baz.supercar.arrow`. diff --git a/docs/src/faq.md b/docs/src/faq.md index 004fce3..acf844f 100644 --- a/docs/src/faq.md +++ b/docs/src/faq.md @@ -2,12 +2,16 @@ ## What is the point of Legolas.jl? Who benefits from using it? -At its core, Legolas.jl provides a lightweight, expressive set of mechanisms/patterns for generating `Tables.AbstractRow` types in a manner that enables schema composability, extensibility and a few nice utilties on top. 
+At its core, Legolas.jl provides a lightweight, expressive set of mechanisms/patterns for wrangling Tables.jl-compliant values in a manner that enables schema composability, extensibility and a few nice utilities on top. The package originated from code developed internally at Beacon to wrangling heterogeneous Arrow datasets, and is thus probably mostly useful for folks in a similar situation. If you're curating tabular datasets and you'd like to build shared Julia tools atop the schemas therein, then Legolas.jl may be worth checking out. ## Why does Legolas.jl support Arrow as a (de)serialization target, but not, say, JSON? -Technically, Legolas.jl's core `Row`/`Schema` functionality is totally agnostic to (de)serialization and could be useful for anybody who wants to generate new `Tables.AbstractRow` types. +Technically, Legolas.jl's core `@schema`/`@version` functionality is agnostic to (de)serialization and could be useful for anybody who wants to wrangle Tables.jl-compliant values. -Otherwise, with regards to (de)serialization-specific functionality, Beacon has put effort into ensuring Legolas.jl works well with [Arrow.jl](https://github.com/JuliaData/Arrow.jl) "by default" simply because we're heavy users of the Arrow format. There's nothing stopping users from composing the package with [JSON3.jl](https://github.com/quinnj/JSON3.jl) or other packages. \ No newline at end of file +Otherwise, with regards to (de)serialization-specific functionality, Beacon has put effort into ensuring Legolas.jl works well with [Arrow.jl](https://github.com/JuliaData/Arrow.jl) "by default" simply because we're heavy users of the Arrow format. There's nothing stopping users from composing the package with [JSON3.jl](https://github.com/quinnj/JSON3.jl) or other packages. + +## Why are Legolas.jl's generated record types defined the way that they are? For example, why is the version number hardcoded in the type name? 
+ +Many of Legolas' current choices on this front stem from refactoring efforts undertaken as part of [this pull request](https://github.com/beacon-biosignals/Legolas.jl/pull/54), and directly resulted from a [design mini-investigation](https://gist.github.com/jrevels/fdfe939109bee23566d425440b7c759e) associated with those efforts. diff --git a/docs/src/index.md b/docs/src/index.md index 85248b4..043fd63 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,29 +1,37 @@ # API Documentation -If you're a newcomer to Legolas.jl, please familiarize yourself with via the [tour](https://github.com/beacon-biosignals/Legolas.jl/blob/master/examples/tour.jl) before diving into this documentation. +!!! note + + If you're a newcomer to Legolas.jl, please familiarize yourself with the [tour](https://github.com/beacon-biosignals/Legolas.jl/blob/main/examples/tour.jl) before diving into this documentation. ```@meta CurrentModule = Legolas ``` -## Legolas `Schema`s and `Row`s +## Legolas `Schema`s ```@docs -Legolas.@row -Legolas.Row -Legolas.Schema +Legolas.SchemaVersion +Legolas.@schema +Legolas.@version Legolas.is_valid_schema_name -Legolas.schema_name -Legolas.schema_version -Legolas.schema_qualified_string -Legolas.schema_parent +Legolas.parse_identifier +Legolas.name +Legolas.version +Legolas.identifier +Legolas.parent +Legolas.required_fields +Legolas.declaration +Legolas.declared +Legolas.find_violation +Legolas.complies_with +Legolas.validate ``` ## Validating/Writing/Reading Legolas Tables ```@docs -Legolas.extract_schema -Legolas.validate +Legolas.extract_schema_version Legolas.write Legolas.read ``` diff --git a/docs/src/schema-concepts.md b/docs/src/schema-concepts.md new file mode 100644 index 0000000..7ce7ef8 --- /dev/null +++ b/docs/src/schema-concepts.md @@ -0,0 +1,52 @@ +# Schema-Related Concepts/Conventions + +!!! 
note + + If you're a newcomer to Legolas.jl, please familiarize yourself with the [tour](https://github.com/beacon-biosignals/Legolas.jl/blob/main/examples/tour.jl) before diving into this documentation. + +## [Schema Version Identifiers](@id schema_version_identifier_specification) + +Legolas defines "schema version identifiers" as strings of the form: + +- `name@version` where: + - `name` is a lowercase alphanumeric string and may include the special characters `.` and `-`. + - `version` is a non-negative integer. +- or, `x>y` where `x` and `y` are valid schema version identifiers and `>` denotes "extends from". + +A schema version identifier is said to be *fully qualified* if it includes the identifiers of all ancestors of the particular schema version that it directly identifies. + +Schema authors should follow the below conventions when choosing the name of a new schema: + +1. Include a namespace. For example, assuming the schema is defined in a package Foo.jl, `foo.automobile` is good, `automobile` is bad. +2. Prefer singular over plural. For example, `foo.automobile` is good, `foo.automobiles` is bad. +3. Don't "overqualify" a schema name with ancestor-derived information that is better captured by the fully qualified identifier of a specific schema version. For example, `bar.automobile` should be preferred over `bar.foo.automobile`, since `bar.automobile@1>foo.automobile@1` is preferable to `bar.foo.automobile@1>foo.automobile@1`. Similarly, `baz.supercar` should be preferred over `baz.automobile.supercar`, since `baz.supercar@1>bar.automobile@1` is preferable to `baz.automobile.supercar@1>bar.automobile@1`. + +## Schema Versioning: You Break It, You Bump It + +While it is fairly established practice to [semantically version source code](https://semver.org/), the world of data/artifact versioning is a bit more varied. As presented in the tour, each `Legolas.SchemaVersion` carries a single version integer. 
The central rule that governs Legolas' schema versioning approach is: + +**Do not introduce a change to an existing schema version that might cause existing compliant data to become non-compliant; instead, incorporate the intended change in a new schema version whose version number is one greater than the previous version number.** + +For example, a schema author must introduce a new schema version for any of the following changes: + +- A new type-restricted required field is added to the schema. +- An existing required field's type restriction is tightened. +- An existing required field is renamed. + +One benefit of Legolas' approach is that multiple schema versions may be defined in the same codebase, e.g. there's nothing that prevents `@version(FooV1, ...)` and `@version(FooV2, ...)` from being defined and utilized simultaneously. The source code that defines any given Legolas schema version and/or consumes/produces Legolas tables is presumably already semantically versioned, such that consumer/producer packages can determine their compatibility with each other in the usual manner via interpreting major/minor/patch increments. + +Note that it is preferable to avoid introducing new versions of an existing schema, if possible, in order to minimize code/data churn for downstream producers/consumers. Thus, authors should prefer conservative field type restrictions from the get-go. Remember: loosening a field type restriction is not a breaking change, but tightening one is. + +## Important Expectations Regarding Custom Field Assignments + +Schema authors should ensure that their `@version` declarations meet two important expectations so that generated record types behave as intended: + +1. Custom field assignments should preserve the [idempotency](https://en.wikipedia.org/wiki/Idempotence) of record type constructors. +2. Custom field assignments should not observe mutable non-local state. 
+ +Thus, given a Legolas-generated record type `R`, the following should hold for all valid values of `fields`: + +```jl +R(R(fields)) == R(fields) +R(fields) == R(fields) +``` diff --git a/docs/src/schema.md b/docs/src/schema.md deleted file mode 100644 index bcb4555..0000000 --- a/docs/src/schema.md +++ /dev/null @@ -1,41 +0,0 @@ -# Tips for Schema Authors - -If you're a newcomer to Legolas.jl, please familiarize yourself with via the [tour](https://github.com/beacon-biosignals/Legolas.jl/blob/master/examples/tour.jl) before diving into this documentation. - -## Simple Integer Versioning: You Break It, You Bump It - -While it is fairly established practice to [semantically version source code](https://semver.org/), the world of data/artifact versioning is a bit more varied. As presented in the tour, each `Legolas.Schema` has a single version integer. In this section, we'll discuss how to interpret this version integer. - -We start with an assumption: source code that defines any given Legolas schema and/or consumes/produces Legolas tables is already semantically versioned, such that consumer/producer packages can determine their compatibility with each other in the usual manner via interpreting major/minor/patch increments. - -With that in mind, here is the central guideline to obey as a Legolas schema author: - -**If an update is made to a schema that potentially requires existing data to be rewritten in order to comply with the updated schema, then the version integer associated with that schema should be incremented.** - -For example, you must increment the version integer if any of the following changes are made: - -- A new non-`>:Missing` required field is added to the schema. -- An existing required field's type restriction is tightened. -- An existing required field is renamed. 
- -## How to Avoid Breaking Changes - -It is preferable to avoid incrementing a schema's version integer ("making a breaking change") whenever possible to avoid code/data churn for consumers. Following the below guidelines should help make breaking changes less likely: - -1. Allow required fields to be `Missing` whenever reasonable. -2. Prefer conservative field type restrictions from the get-go, to avoid needing to tighten them later. -3. Handle/enforce "potential deprecation paths" in a required field's RHS definition when possible. For example, imagine a schema that contains a required field `id::Union{UUID,String} = id` where `id` is either a `UUID`, or a `String` that may be parsed as a `UUID`. Now, let's imagine we decided we wanted to update the schema such that new tables ALWAYS normalize `id` to a proper `UUID`. In this case, it is preferable to simply update this required field to `id::Union{UUID,String} = UUID(id)` instead of `id::UUID = id`. The latter is a breaking change that requires incrementing the schema's version integer, while the former achieves the same practical result without breaking consumers of old data. - -When making deprecation/API decisions, keep in mind that multiple schema versions may be defined in the same codebase; there's nothing that prevents `@row("my-schema@1", ...)` and `@row("my-schema@2", ...)` from being defined/utilized simultaneously. - -# Naming Conventions - -1. Include a namespace. For example, assuming the schema is defined in a package Foo.jl, `foo.automobile` is good, `automobile` is bad. -2. Prefer singular over plural. For example, `foo.automobile` is good, `foo.automobiles` is bad. -3. Don't overqualify the schema name; that's what the qualified schema identifier is for! For example, `bar.automobile@1>foo.automobile@1` is good, `baz.supercar@1>bar.automobile@1` is good, `bar.foo.automobile@1>foo.automobile@1` is bad, `baz.automobile.supercar@1>bar.automobile@1` is bad. -4. 
When writing tables to files, use `*..arrow` as the file extension. For example, `filename.baz.supercar.arrow` is good, `filename.baz.supercar.bar.automobile.arrow` is bad, `baz.supercar.arrow` is bad. - -# Other Tips For Composable Schemas - -1. Prefer [idempotency](https://en.wikipedia.org/wiki/Idempotence) in required field's RHS definitions. -2. Prefer authoring child schemas such that they are [Liskov substitutable](https://en.wikipedia.org/wiki/Liskov_substitution_principle) for their parents. A less fancy way of stating the same thing: try to ensure that your child schema only adds additional fields to the parent and doesn't alter existing fields. diff --git a/docs/src/specification.md b/docs/src/specification.md deleted file mode 100644 index d5ee1bf..0000000 --- a/docs/src/specification.md +++ /dev/null @@ -1,14 +0,0 @@ -# Legolas Table Specification - -Legolas.jl's target (de)serialization format, Arrow, already features wide cross-language adoption, enabling serialized "Legolas tables" to be seamlessly read into many non-Julia environments. This brief specification defines the requirements for any given serialized Arrow table to be considered a "valid Legolas table" agnostic to any particular implementation. - -Currently, there is only a single requirement: the presence of a `legolas_schema_qualified` field in the Arrow table's metadata. - -The `legolas_schema_qualified` field's value should be either: - -1. `name@version` where: - - `name` is a lowercase alphanumeric string and may include the special characters `.` and `-` - - `version` is a non-negative integer -2. `x>y` where `x` and `y` are valid `legolas_schema_qualified` strings - -This field may be employed by consumers to match deserialized Legolas tables to application-layer schema definitions. 
\ No newline at end of file diff --git a/examples/tour.jl b/examples/tour.jl index 4f1df3b..a8e6a7e 100644 --- a/examples/tour.jl +++ b/examples/tour.jl @@ -3,192 +3,411 @@ # functionality in a concrete manner, and so that we can ensure examples stay # current as the package evolves. -using Legolas, Arrow, Tables, Test +using Legolas, Arrow, Tables, Test, UUIDs -using Legolas: @row, Schema +using Legolas: @schema, @version, complies_with, find_violation, validate ##### -##### Introduction to the `@row` Macro and `Legolas.Row` constructors +##### Introduction ##### -# The most interesting bit of Legolas.jl functionality is the package's `@row` macro, which callers can use -# define new [`Tables.AbstractRow`-compliant](https://tables.juliadata.org/stable/#Tables.AbstractRow-1) -# row types that exhibit some opinionated (but desirable!) properties w.r.t. composability/extensibility. -# -# These row type properties are fundamental to the wider data curation patterns that Legolas.jl seeks to -# facilitate, so let's explore them before we dig deeper into Legolas.jl's other table-centric utilities. - -# Declare a `Legolas.Schema` with name `"my-schema"` at version `1` whose *required fields* -# `a`, `b`, `c`, `d`, and `e` are specified via the provided assignment expressions, then return -# a corresponding row type that matches the declared schema: -const MyRow = @row("my-schema@1", - a::Real = sin(a), - b::String = string(a, b, c), - c = [a, b, c], - d::Int, - e) - -# `MyRow` is thus an alias for the type returned by the `@row` macro: -@test MyRow == Legolas.Row{typeof(Schema("my-schema@1"))} - -# The `MyRow` type has several useful constructors. 
Let's start with the constructor that -# accepts all required fields as keyword arguments: -row = MyRow(a=1.5, b="hello", c="goodbye", d=2, e=["anything"]) - -# By examining `row`'s fields, we can see how the assignment expressions from `MyRow`'s -# `@row` declaration were applied in a simple linear fashion to the input arguments: -@test row.a == sin(1.5) -@test row.b == string(sin(1.5), "hello", "goodbye") -@test row.c == [sin(1.5), string(sin(1.5), "hello", "goodbye"), "goodbye"] -@test row.d == 2 -@test row.e == ["anything"] - -# In fact, the field assignment expressions provided to the `@row` macro are interpolated -# nearly as-is into the underlying code generated by `@row`. For example, the relevant code -# generated by `MyRow`'s `@row` declaration looks roughly like: + +# Legolas provides mechanisms for constructing, reading, writing, and validating +# Tables.jl-compliant values against extensible, versioned, user-specified *schemas*. + +# We'll dive into the extensibility and versioning aspects of Legolas later. For now, +# let's start the tour by declaring a new Legolas schema via the `@schema` macro. + +# Here, we declare a new schema named `example.foo`, specifying that Legolas should +# use the prefix `Foo` for all `example.foo`-related type definitions: +@schema "example.foo" Foo + +# The above schema declaration provides the necessary scaffolding to start declaring +# new *versions* of the `example.foo` schema. Schema version declarations specify the +# set of required fields that a given table (or row) must contain in order to comply +# with that schema version. 
Let's use the `@version` macro to declare an initial +# version of the `example.foo` schema with some required fields: +@version FooV1 begin + a::Real + b::String + c + d::AbstractVector +end + +# In the above declaration, the symbol `FooV1` can be broken into the prefix `Foo` (as +# specified in `example.foo`'s `@schema` declaration) and `1`, the integer that identifies +# this particular version of the `example.foo` schema. The `@version` macro requires this +# symbol to always follow this format (`$(prefix)V$(integer)`), because it generates two +# special types that match it. For example, our `@version` declaration above generated: # -# function Legolas._transform(::typeof(Legolas.Schema("my-schema", 1)); -# a=missing, b=missing, c=missing, d=missing, e=missing, -# other...) -# a::Real = sin(a) -# b::String = string(a, b, c) -# c::Any = [a, b, c] -# d::Int = d -# e::Any = e -# return (; a, b, c, d, e, other...) -# end +# - `FooV1`: A special subtype of `Tables.AbstractRow` whose fields match the corresponding +# schema version's declared required fields. +# - `FooV1SchemaVersion`: An alias for `Legolas.SchemaVersion` that matches the corresponding +# schema version. + +# Let's first examine `FooV1SchemaVersion`: +@test Legolas.SchemaVersion("example.foo", 1) == FooV1SchemaVersion() +@test Legolas.SchemaVersion("example.foo", 1) isa FooV1SchemaVersion +@test "example.foo@1" == Legolas.identifier(FooV1SchemaVersion()) + +# As you can see, Legolas' Julia-agnostic identifier for this schema version is `example.foo@1`. +# To avoid confusion throughout this tour, we'll use this Julia-agnostic identifier to refer to +# individual schema versions in the abstract sense, while we'll use the relevant `SchemaVersion` +# aliases to specifically refer to the types that represent schema versions in Julia. 
+ +##### +##### `Tables.Schema` Compliance/Validation +##### + +# We can use `complies_with`, `validate`, and `find_violation` to check whether a given +# `Tables.Schema` (ref https://tables.juliadata.org/stable/#Tables.Schema) complies with +# `example.foo@1`. + +# For example, all of the following `Tables.Schema`s comply with `example.foo@1`: +for s in [Tables.Schema((:a, :b, :c, :d), (Real, String, Any, AbstractVector)), # All required fields must be present... + Tables.Schema((:a, :b, :c, :d), (Int, String, Float64, Vector)), # ...and have subtypes that match the schema's declared type constraints. + Tables.Schema((:b, :a, :d, :c), (String, Int, Vector, Float64)), # Fields do not have to be in any particular order, as long as they are present. + Tables.Schema((:a, :b, :d), (Int, String, Vector)), # Fields whose declared type constraints are `>:Missing` may be elided entirely. + Tables.Schema((:a, :x, :b, :y, :d), (Int, Any, String, Any, Vector))] # Non-required fields may also be present. + # if `complies_with` finds a violation, it returns `false`; returns `true` otherwise + @test complies_with(s, FooV1SchemaVersion()) + + # if `validate` finds a violation, it throws an error indicating the violation; + # returns `nothing` otherwise + @test validate(s, FooV1SchemaVersion()) isa Nothing + + # if `find_violation` finds a violation, it returns a tuple indicating the relevant + # field and its violation; returns `nothing` otherwise + @test isnothing(find_violation(s, FooV1SchemaVersion())) +end + +# ...while the below `Tables.Schema`s do not: + +s = Tables.Schema((:a, :c, :d), (Int, Float64, Vector)) # The required non-`>:Missing` field `b::String` is not present. 
+@test !complies_with(s, FooV1SchemaVersion()) +@test_throws ArgumentError validate(s, FooV1SchemaVersion()) +@test isequal(find_violation(s, FooV1SchemaVersion()), :b => missing) + +s = Tables.Schema((:a, :b, :c, :d), (Int, String, Float64, Any)) # The type of required field `d::AbstractVector` is not `<:AbstractVector`. +@test !complies_with(s, FooV1SchemaVersion()) +@test_throws ArgumentError validate(s, FooV1SchemaVersion()) +@test isequal(find_violation(s, FooV1SchemaVersion()), :d => Any) + +# The expectations that characterize Legolas' particular notion of "schematic compliance" - requiring the +# presence of pre-specified declared fields, assuming non-present fields to be implicitly `missing`, and allowing +# the presence of non-required fields - were chosen such that the question "Does the table `t` comply with the Legolas +# schema version `s`?" is roughly equivalent to "Can a logical view be trivially constructed atop table `t` that contains +# only the required fields declared by `s`?" The ability to cleanly ask this question enables a weak notion of "subtyping" +# (see https://en.wikipedia.org/wiki/Duck_typing, https://en.wikipedia.org/wiki/Liskov_substitution_principle) that is +# core to Legolas' mechanisms for defining, extending, and versioning interfaces to tabular data. + +##### +##### Legolas-Generated Record Types +##### + +# As mentioned in this tour's introduction, `FooV1` is a subtype of `Tables.AbstractRow` whose fields are guaranteed to +# match all the fields required by `example.foo@1`. We refer to such Legolas-generated types as "record types" (see +# https://en.wikipedia.org/wiki/Record_(computer_science)). 
These record types are direct subtypes of +# `Legolas.AbstractRecord`, which is, itself, a subtype of `Tables.AbstractRow`: +@test FooV1 <: Legolas.AbstractRecord <: Tables.AbstractRow + +# Record type constructors accept keyword arguments or `Tables.AbstractRow`-compliant values: +fields = (a=1.0, b="hi", c=π, d=[1, 2, 3]) +@test NamedTuple(FooV1(; fields...)) == fields +@test NamedTuple(FooV1(fields)) == fields + +# This may seem like a fairly trivial constructor in the preceding example, but it has some properties +# that can be quite convenient in practice. Specifically, row values provided to `FooV1` may: # -# This `Legolas._transform` method is invoked at the core of the `MyRow` constructor. As you -# might have noticed, this method has two interesting properties we haven't yet demonstrated. +# - ...contain the associated schema version's required fields in any order +# - ...elide required fields, in which case the constructor will assume them to be `missing` +# - ...contain any other fields in addition to the required fields; such additional fields are simply ignored # -# Here we demonstrate the first property - required fields have a `missing` value by default: -@test isequal(MyRow(a=1.5, b="hello", c="goodbye", d=2), MyRow((; a=1.5, b="hello", c="goodbye", d=2, e=missing))) -@test_throws MethodError MyRow(a=1.5, b="hello", c="goodbye") # correctly throws a `MethodError` when evaluating `d::Int = missing` +# Demonstrating a few of these properties: -# And here's a demonstration of the second property - callers may pass in any other fields in -# addition to the required fields: -row = MyRow(a=1.5, b="hello", c="goodbye", my_other_field=":)", d=2, e=["anything"]) -@test row.my_other_field == ":)" +# Providing the additional non-required field `x` in the input, which is simply ignored: +fields_with_x = (; fields..., x="x") +@test NamedTuple(FooV1(fields_with_x)) == fields -# Finally, there's also a `MyRow` constructor that accepts any 
`Tables.AbstractRow`-compliant value, -# and extracts all input fields from that value. Here, we demonstrate with a `NamedTuple`: -@test row == MyRow((a=1.5, b="hello", c="goodbye", d=2, e=["anything"], my_other_field=":)")) +# Eliding the required field `c`, which is assigned `missing` in the output: +foo = FooV1(; a=1.0, b="hi", d=[1, 2, 3]) +@test isequal(NamedTuple(foo), (a=1.0, b="hi", c=missing, d=[1, 2, 3])) -# To summarize: +# Providing the non-compliantly-typed field `d::Int`, inducing a `MethodError`: +@test_throws MethodError FooV1(; a=1.0, b="hi", d=2) + +# Implicitly providing the non-compliantly-typed field `d::Missing`, inducing a `MethodError`: +@test_throws MethodError FooV1(; a=1.0, b="hi") + +##### +##### Custom Field Assignments +##### + +# Schema authors may tailor the behavior of record type constructors for their schema by defining custom field +# assignments in the schema's declaration. The `example.foo@1` declaration doesn't feature +# any such assignments, so let's declare a new schema version `example.bar@1` that does: +@schema "example.bar" Bar + +@version BarV1 begin + x::Union{Int8,Missing} = ismissing(x) ? x : Int8(clamp(x, -128, 127)) + y::String = string(y) + z::String = ismissing(z) ? string(y, '_', x) : z +end + +# These assignment statements are inlined into `BarV1`'s inner constructor +# definition, such that it is roughly equivalent to: +# +# function BarV1(; x=missing, y=missing, z=missing) +# x::Union{Int8,Missing} = ismissing(x) ? 
x : Int8(clamp(x, -128, 127)) +# y::String = string(y) +# z::String = ismissing(z) ? string(y, '_', x) : z +# return new(x, y, z) +# end +# +# ...so that invocations `BarV1(; ...)` have the following behavior: +@test NamedTuple(BarV1(; x=200, y=:hi)) == (x=127, y="hi", z="hi_127") +@test isequal(NamedTuple(BarV1(; y=:hi)), (x=missing, y="hi", z="hi_missing")) +@test NamedTuple(BarV1(; x=200, y=:hi, z="bye")) == (x=127, y="hi", z="bye") + +# Custom field assignments enable schema authors to enforce value-level constraints and to imbue +# record type constructors with convenient per-field transformations/conversions so that they can +# accept a wider range of applicable inputs for each field. However, schema authors that use custom +# field assignments must always take care to preserve idempotency and avoid side effects / reliance +# on non-local state. +# +# In other words, given a record type `R` generated from a non-pathological `@version` declaration, +# we'd expect the following equivalences to hold: +# +# R(fields) == R(fields) +# R(R(fields)) == R(fields) # -# - Inputs to `Legolas.Row` constructors... -# - ...may be any `Tables.AbstractRow`-compliant value -# - ...may contain required fields in any order -# - ...may elide required fields, in which case the constructor will assume them to be `missing` -# - ...may contain any other fields in addition to the required fields -# - Outputs of `Legolas.Row` constructors... -# - ...will contain all required fields ("missing" fields are explicitly presented with `missing` values) -# - ...will contain all provided non-required fields - -##### -##### Extending Existing Rows/Schemas -##### -# Row types declared via `@row` can inherit the fields and transformations specified by other `@row`-declared types. -# Niftily, the properties of `Legolas.Row` that we demonstrated above enable this extension mechanism to be -# implemented via composition under the hood. 
- -# Declare a row type whose schema is named `"my-child-schema"` at version `1` that inherits the fields of the -# `my-schema@1` schema that we defined in the previous section. -const MyChildRow = @row("my-child-schema@1" > "my-schema@1", - d, # "declaring" the underlying `my-schema@1` field here so that it - # can be referenced in our definition for the `f` field. - f::Int = f + d, - g::String, - c = last(c)) - -# The constructor for `MyChildRow` will first apply its parent's transformation before applying its own. The -# effect of this behavior can be seen clearly in the `c` field value in the following example: -input = (a=1.5, b="hello", c="goodbye", d=2, e=["anything"], f=3, g="foo") -child = MyChildRow(input) -@test child.a == sin(1.5) -@test child.b == string(sin(1.5), "hello", "goodbye") -@test child.c == "goodbye" -@test child.d == 2 -@test child.e == ["anything"] -@test child.f == 5 -@test child.g == "foo" - -# Note that even though we didn't write down any constraints on `d` in our `my-child-schema@1` definition, -# that field still undergoes the parent transformation (defined by `my-schema@1`) where it is constrained -# to `d::Int`. -@test_throws InexactError MyChildRow(Tables.rowmerge(child; d=1.5)) - -# A note on syntax: You might ask "Why use `>` as the inheritance operator instead of `<:`?" There are actually three reasons. Firstly, -# `<:` is canonically a subtyping operator that implies the Liskov substition principle, but because Legolas allow arbitrary RHS code in -# required field declarations, a "child" row is not de facto substitutable for its parent. Secondly, `>` implies the actual ordering that -# row transformations are applied in; the parent transformation comes before the child transformation. Thirdly, the child row will usually -# (though technically not always) have a greater total number of required fields than the parent row. 
- -##### -##### Validating/Writing/Reading `Arrow.Table`s with Legolas.jl -##### - -# Legolas provides special methods for reading/writing/validating Arrow tables that utilize `Legolas.Schema`s. To -# start exploring these methods, we'll first construct a dummy table using `row::MyRow` from the previous section: -rows = [row, - Tables.rowmerge(row; b="a different one"), - Tables.rowmerge(row; e=:anything)] - -# We can validate that `rows` is compliant w.r.t. `schema` via `Legolas.validate`, which will throw a descriptive -# error if the `Tables.schema(rows)` mismatches with the required columns/types specified by the `schema`. -schema = Schema("my-schema", 1) -@test schema === Schema("my-schema@1") # `Schema("my-schema", 1)` is an alternative to `Schema("my-schema@1")` -@test (Legolas.validate(rows, schema); true) -invalid = vcat(rows, Tables.rowmerge(row; a="this violates the schema's `a::Real` requirement")) -@test_throws ArgumentError("field `a` has unexpected type; expected <:Real, found Any") Legolas.validate(invalid, schema) - -# This highlights two important properties regarding `Legolas.Schema` validation: +# These are two very important expectations that must be met for record types to behave as intended, +# but they are not enforceable by Legolas itself, since Legolas allows custom field assignments +# to include arbitrary Julia code; thus, schema authors are responsible for not violating these +# expectations. # -# - First, it's okay that the `e` field isn't present in this `Tables.Schema` because `my-schema` permits `e::Missing`. 
-# - Second, field ordering is unimportant and is not considered when determining whether a give `Tables.` -@test (Legolas.validate(Tables.Schema((:c, :d, :a, :b), Tuple{Vector,Int,Float64,String}), schema); true) +# Let's check that `BarV1` meets these expectations: +fields = (x=200, y=:hi) +@test BarV1(fields) == BarV1(fields) +@test BarV1(BarV1(fields)) == BarV1(fields) + +# For illustration's sake, here is an example of a pathological `@version` declaration that violates +# both of these expectations: +const GLOBAL_STATE = Ref(0) + +@schema "example.bad" Bad + +@version BadV1 begin + x::Int = x + 1 + y = (GLOBAL_STATE[] += y; GLOBAL_STATE[]) +end + +fields = (x=1, y=1) + +# Demonstration of non-idempotency, both in `x` and `y` fields: +@test BadV1(BadV1(fields)) != BadV1(fields) + +# Demonstration of side effects / reliance on non-local state in `y` field: +@test BadV1(fields) != BadV1(fields) + +##### +##### Extending Existing Schema Versions +##### + +# New schema versions can inherit other schema version's required fields. Here, we declare `example.baz@1` +# as an "extension" of `example.bar@1`: +@schema "example.baz" Baz + +@version BazV1 > BarV1 begin + x::Int8 + z::String + k::Int64 = ismissing(k) ? length(z) : k +end + +# Notice how the child's `@version` declaration may reference the parent's required fields (but need not reference +# every single one), may tighten the constraints of the parent's required fields, and may introduce new required +# fields atop the parent's required fields. -# Legolas also provides `Legolas.write(path_or_io, table, schema; kwargs...)`, which wraps `Arrow.write(path_or_io, table; kwargs...)` +# For a given Legolas schema version extension to be valid, all `Tables.Schema`s that comply with the child +# must comply with the parent, but the reverse need not be true. We can check a schema version's required fields +# and their type constraints via `Legolas.required_fields`. 
Based on these outputs, it is a worthwhile exercise +# to confirm for yourself that `BazV1SchemaVersion` is a valid extension of `BarV1SchemaVersion` under the aforementioned rule: +@test Legolas.required_fields(BarV1SchemaVersion()) == (x=Union{Missing,Int8}, y=String, z=String) +@test Legolas.required_fields(BazV1SchemaVersion()) == (x=Int8, y=String, z=String, k=Int64) + +# As a counterexample, the following is invalid, because the declaration of `x::Any` would allow for `x` +# values that are disallowed by the parent schema version `example.bar@1`: +@schema "example.broken" Broken +@test_throws Legolas.SchemaVersionDeclarationError @version BrokenV1 > BarV1 begin x::Any end + +# Record type constructors generated for extension schema versions will apply the parent's field +# assignments before applying the child's field assignments. Notice how `BazV1` applies the +# constraints/transformations of both `example.baz@1` and `example.bar@1`: +@test NamedTuple(BazV1(; x=200, y=:hi)) == (x=127, y="hi", z="hi_127", k=6) +@test_throws MethodError BazV1(; y=:hi) # `example.baz@1` does not allow `x::Missing` + +# `BazV1`'s inner constructor definition is roughly equivalent to: +# +# function BazV1(; x=missing, y=missing, z=missing, k=missing) +# __p__ = BarV1(; x, y, z) +# x, y, z = __p__.x, __p__.y, __p__.z +# x::Int8 = x +# z::String = z +# k::Int64 = ismissing(k) ? length(z) : k +# return new(x, y, z, k) +# end + +# One last note on syntax: You might ask "Why use the greater-than symbol as the inheritance operator instead of `<:`?" +# There are a few reasons. The primary reason is purely historical: earlier versions of Legolas did not as rigorously +# demand/enforce subtyping relationships between parent and child schemas' required fields, and so the `<:` operator +# was considered to be a bit too misleading. A secondary reason in favor of `>` was that it implied the actual order +# of application of constraints (i.e. the parent's are applied before the child's).
Lastly, `>` aligns well with the +# property that child schema versions have a greater number of constraints than their parents. + +##### +##### Schema Versioning +##### + +# Throughout this tour, all `@version` declarations have used the version number `1`. As you might guess, you can +# declare more than a single version of any given schema. Here's an example using the `example.foo` schema we defined +# earlier: + +@version FooV2 begin + a::Float64 + b::String + c::Int + d::Vector +end + +@test FooV2SchemaVersion() == Legolas.SchemaVersion("example.foo", 2) + +fields = (a=1.0, b="b", c=3, d=[1,2,3]) +@test NamedTuple(FooV2(fields)) == fields + +# A schema author generally needs to declare a new schema version whenever they introduce changes that are +# considered "breaking" in a very particular Legolas-defined manner. We're not going to dive into this aspect +# of Legolas here in the tour, but please refer to this section of Legolas' documentation for more details: +# https://beacon-biosignals.github.io/Legolas.jl/stable/schema-concepts/#Schema-Versioning:-You-Break-It,-You-Bump-It-1 + +##### +##### Parameterizing Required Field Types +##### + +# Sometimes, it's useful to surface a required field's type as a type parameter of the generated record type. To +# achieve this, the `@version` macro supports use of the `<:` operator to mark fields whose types should be exposed +# as parameters.
For example: + +@schema "example.param" Param + +@version ParamV1 begin + a::Int + b::(<:Real) + c + d::(<:Union{Real,Missing}) +end + +@test typeof(ParamV1(a=1, b=2.0, c="3")) === ParamV1{Float64,Missing} +@test typeof(ParamV1(a=1, b=2.0, c="3", d=4)) === ParamV1{Float64,Int} +@test typeof(ParamV1{Int,Missing}(a=1, b=2.0, c="3")) === ParamV1{Int,Missing} +@test typeof(ParamV1{Int,Float32}(a=1, b=2.0, c="3", d=1)) === ParamV1{Int,Float32} + +# Note that extension schema versions do not implicitly "inherit" their parent's type parameters; if you'd +# like to parameterize the type of a parent's required field in the child schema version, you should explicitly +# include the field in the child's required field list: + +@schema "example.child-param" ChildParam + +@version ChildParamV1 > ParamV1 begin + c::(<:Union{Real,String}) + d::(<:Union{Real,Missing}) + e +end + +@test typeof(ChildParamV1(a=1, b=2.0, c="3", e=5)) === ChildParamV1{String,Missing} +@test typeof(ChildParamV1(a=1, b=2.0, c=3, d=4, e=5)) === ChildParamV1{Int,Int} +@test typeof(ChildParamV1{Int,Missing}(a=1, b=2.0, c=3.0, e=5)) === ChildParamV1{Int,Missing} +@test typeof(ChildParamV1{String,Float32}(a=1, b=2.0, c="3", d=1, e=5)) === ChildParamV1{String,Float32} + +##### +##### Validating/Writing/Reading Arrow Tables with Legolas.jl +##### + +# Legolas provides special methods for reading/writing/validating Arrow tables with `Legolas.SchemaVersion`s. 
+# To start exploring these methods, we'll first construct a dummy table using the previously defined `BazV1`: +table = [BazV1(; x=23, y=:beep), + BazV1(; x=200, y=:boop, k=4), + BazV1(; x=23, y=:buzz, z="some_other_value")] + +table_isequal(a, b) = isequal(Legolas.materialize(a), Legolas.materialize(b)) + +# `Legolas.write(dest, table, sv::Legolas.SchemaVersion; kwargs...)` wraps `Arrow.write(dest, table; kwargs...)` # and performs two additional operations atop the usual operations performed by that function: # -# - By default, the provided Tables.jl-compliant `table` is validated against `schema` via `Legolas.validate` before +# - By default, the provided Tables.jl-compliant `table` is validated against `sv` via `Legolas.validate` before # it is actually written out. Note that this can be disabled by passing `validate=false` to `Legolas.write`. # # - `Legolas.write` ensures that the written-out Arrow table's metadata contains a `"legolas_schema_qualified"` -# key whose value is `Legolas.schema_qualified_string(schema)`. This field enables consumers of the table to +# key whose value is `Legolas.identifier(sv)`. This field enables consumers of the table to # perform automated (or manual) schema discovery/evolution/validation.
- -schema = Schema("my-child-schema", 1) # For this example, let's use a schema that has a parent -rows = [child, - Tables.rowmerge(child; b="a different one"), - Tables.rowmerge(child; e=:anything)] io = IOBuffer() -Legolas.write(io, rows, schema) +Legolas.write(io, table, BazV1SchemaVersion()) t = Arrow.Table(seekstart(io)) +@test Arrow.getmetadata(t) == Dict("legolas_schema_qualified" => "example.baz@1>example.bar@1") +@test table_isequal(t, Arrow.Table(Arrow.tobuffer(table))) +@test table_isequal(t, Arrow.Table(Legolas.tobuffer(table, BazV1SchemaVersion()))) # `Legolas.tobuffer` is analogous to `Arrow.tobuffer` -table_isequal(a, b) = isequal(Legolas.materialize(a), Legolas.materialize(b)) - -@test Arrow.getmetadata(t) == Dict("legolas_schema_qualified" => "my-child-schema@1>my-schema@1") -@test table_isequal(t, Arrow.Table(Arrow.tobuffer(rows))) -@test table_isequal(t, Arrow.Table(Legolas.tobuffer(rows, schema))) # `Legolas.tobuffer` is analogous to `Arrow.tobuffer` - -invalid = vcat(rows, Tables.rowmerge(child; a="this violates the schema's `a::Real` requirement")) -@test_throws ArgumentError("field `a` has unexpected type; expected <:Real, found Any") Legolas.tobuffer(invalid, schema) - -# Similarly, Legolas provides `Legolas.read(path_or_io)`, which wraps `Arrow.Table(path_or_io)` -# and performs `Legolas.validate` on the resulting `Arrow.Table` before returning it. -@test table_isequal(Legolas.read(Legolas.tobuffer(rows, schema)), t) +# Similarly, Legolas provides `Legolas.read(src)`, which wraps `Arrow.Table(src)`, but +# validates the deserialized `Arrow.Table` against its declared schema version before +# returning it: +@test table_isequal(Legolas.read(Legolas.tobuffer(table, BazV1SchemaVersion())), t) msg = """ - could not extract valid `Legolas.Schema` from provided Arrow table; - is it missing the expected custom metadata and/or the expected - \"legolas_schema_qualified\" field? 
+ could not extract valid `Legolas.SchemaVersion` from the `Arrow.Table` read + via `Legolas.read`; is it missing the expected custom metadata and/or the + expected \"legolas_schema_qualified\" field? """ -@test_throws ArgumentError(msg) Legolas.read(Arrow.tobuffer(rows)) -invalid_but_has_schema_metadata = Arrow.tobuffer(invalid; - metadata = ("legolas_schema_qualified" => Legolas.schema_qualified_string(schema),)) -@test_throws ArgumentError("field `a` has unexpected type; expected <:Real, found Union{Missing, Float64, String}") Legolas.read(invalid_but_has_schema_metadata) +@test_throws ArgumentError(msg) Legolas.read(Arrow.tobuffer(table)) +invalid = [Tables.rowmerge(row; k=string(row.k)) for row in table] +invalid_but_has_metadata = Arrow.tobuffer(invalid; metadata=("legolas_schema_qualified" => Legolas.identifier(BazV1SchemaVersion()),)) +@test_throws ArgumentError("field `k` has unexpected type; expected <:Int64, found String") Legolas.read(invalid_but_has_metadata) # A note about one additional benefit of `Legolas.read`/`Legolas.write`: Unlike their Arrow.jl counterparts, # these functions are relatively agnostic to the types of provided path arguments. Generally, as long as a # given `path` supports `Base.read(path)::Vector{UInt8}` and `Base.write(path, bytes::Vector{UInt8})` then # `path` will work as an argument to `Legolas.read`/`Legolas.write`. At some point, we'd like to make similar # upstream improvements to Arrow.jl to render its API more path-type-agnostic. + +##### +##### Schema Version Portability (`Legolas.accepted_field_type`) +##### + +# Consider the following schema version: + +@schema "example.portable" Portable + +@version PortableV1 begin + id::UUID = UUID(id) +end + +# Here, `PortableV1` will convert inputs into `UUID`s as part of construction. This behavior may be desirable in many cases, +# but this definition actually has interesting implications for this schema's notion of compliance. 
In particular, this schema +# version carries an implicit requirement that schema-compliant Arrow data must be Julia-produced; Arrow itself doesn't define +# a native UUID type, so other languages may very well (de)serialize UUIDs as 128-bit binary values in a manner that Arrow.jl +# might not recognize as Julia's UUID type. + +# Thus, while this schema version implies that the only compliant `Tables.Schema` is `Tables.Schema((:id,), (UUID,))`, +# it is actually desirable to also consider `Tables.Schema((:id,), (UInt128,))` to be compliant in order to support +# non-Julia-produced data. It'd be annoying, though, to need to alter our `PortableV1` constructor just to suit this purpose, +# since its UUID conversion behavior (and the corresponding type constraint) may be useful for validated construction. + +# Luckily, it turns out that Legolas is actually smart enough to natively support this by default: +@test complies_with(Tables.Schema((:id,), (UUID,)), PortableV1SchemaVersion()) +@test complies_with(Tables.Schema((:id,), (UInt128,)), PortableV1SchemaVersion()) + +# How is this possible? Well, when Legolas checks whether a given field `f::T` matches a required field `f::F`, it doesn't +# directly check that `T <: F`; instead, it checks that `T <: Legolas.accepted_field_type(sv, F)` where `sv` is the relevant +# `SchemaVersion`. The fallback definition of `Legolas.accepted_field_type(::SchemaVersion, F::Type)` is simply `F`, but there +# are a few other default overloads to support common Base types that can cause portability issues: +# +# accepted_field_type(::SchemaVersion, ::Type{UUID}) = Union{UUID,UInt128} +# accepted_field_type(::SchemaVersion, ::Type{Symbol}) = Union{Symbol,String} +# +# Schema version authors should feel free to override these `Legolas.accepted_field_type` definitions (and/or add new definitions) +# for their own `SchemaVersion` types. 
diff --git a/src/Legolas.jl b/src/Legolas.jl index 76381ec..e92a818 100644 --- a/src/Legolas.jl +++ b/src/Legolas.jl @@ -1,13 +1,11 @@ module Legolas -using Tables, Arrow - - +using Tables, Arrow, UUIDs const LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY = "legolas_schema_qualified" include("lift.jl") -include("rows.jl") +include("schemas.jl") include("tables.jl") end # module diff --git a/src/lift.jl b/src/lift.jl index c5b7676..e97794e 100644 --- a/src/lift.jl +++ b/src/lift.jl @@ -19,7 +19,6 @@ Returns a curried function, `x -> lift(f,x)` """ lift(f) = Base.Fix1(lift, f) - """ construct(T::Type, x) diff --git a/src/rows.jl b/src/rows.jl deleted file mode 100644 index 92190b3..0000000 --- a/src/rows.jl +++ /dev/null @@ -1,350 +0,0 @@ -##### -##### Schema -##### - -const ALLOWED_SCHEMA_NAME_CHARACTERS = Char['-', '.', 'a':'z'..., '0':'9'...] - -""" - Legolas.is_valid_schema_name(x::AbstractString) - -Return `true` if `x` is a valid schema name, return `false` otherwise. - -Valid schema names are lowercase, alphanumeric, and may contain hyphens or periods. -""" -is_valid_schema_name(x::AbstractString) = all(i -> i in ALLOWED_SCHEMA_NAME_CHARACTERS, x) - -""" - Legolas.Schema{name,version} - -A type representing the schema of a [`Legolas.Row`](@ref). The `name` (a `Symbol`) and `version` (an `Integer`) -are surfaced as type parameters, allowing them to be utilized for dispatch. - -For more details and examples, please see `Legolas.jl/examples/tour.jl` and the "Tips for Schema Authors" -section of the Legolas.jl documentation. 
- -See also: [`schema_name`](@ref), [`schema_version`](@ref), [`schema_parent`](@ref) -""" -struct Schema{name,version} end - -Schema(schema::Schema) = schema - -# support (de)serialization of Schemas to Arrow -const LEGOLAS_SCHEMA_ARROW_NAME = Symbol("JuliaLang.Legolas.Schema") -Arrow.ArrowTypes.arrowname(::Type{<:Schema}) = LEGOLAS_SCHEMA_ARROW_NAME -Arrow.ArrowTypes.ArrowType(::Type{<:Schema}) = String -Arrow.ArrowTypes.toarrow(schema::Schema) = schema_qualified_string(schema) -Arrow.ArrowTypes.JuliaType(::Val{LEGOLAS_SCHEMA_ARROW_NAME}, ::Any) = Schema -Arrow.ArrowTypes.fromarrow(::Type{<:Schema}, qualified_string) = Schema(qualified_string) - -""" - Legolas.Schema(name::AbstractString, version::Integer) - -Return `Legolas.Schema{Symbol(name),version}()`. This constructor will throw an `ArgumentError` if `name` is -not a valid schema name. - -Prefer using this constructor over `Legolas.Schema{Symbol(name),version}()` directly. -""" -function Schema(name::AbstractString, version::Integer) - version >= 0 || throw(ArgumentError("`Legolas.Schema` version must be non-negative, recieved: $version")) - is_valid_schema_name(name) || throw(ArgumentError("argument is not a valid `Legolas.Schema` name: \"$name\"")) - return Schema{Symbol(name),version}() -end - -""" - Legolas.Schema(s::AbstractString) - -Return `Legolas.Schema(name, n)` where `s` is a valid schema identifier of the form `"name@n"`. - -`s` may also be a fully qualified schema identifier of the form `"name@n>...>..."`. -""" -function Schema(s::AbstractString) - x = split(first(split(s, '>', limit=2)), '@') - if length(x) == 2 - name, version = x - version = tryparse(Int, version) - version isa Int && return Schema(name, version) - end - throw(ArgumentError("argument is not a valid `Legolas.Schema` string: \"$s\"")) -end - -""" - schema_name(::Type{<:Legolas.Schema{name}}) - schema_name(::Legolas.Schema{name}) - -Return `name`. 
-""" -@inline schema_name(::Type{<:Schema{name}}) where {name} = name -@inline schema_name(schema::Schema) = schema_name(typeof(schema)) - -""" - schema_version(::Type{Legolas.Schema{name,version}}) - schema_version(::Legolas.Schema{name,version}) - -Return `version`. -""" -@inline schema_version(::Type{<:Schema{name,version}}) where {name,version} = version -@inline schema_version(schema::Schema) = schema_version(typeof(schema)) - -""" - schema_parent(::Type{Legolas.Schema{name,version}}) - schema_parent(::Legolas.Schema{name,version}) - -Return the `Legolas.Schema` instance that corresponds to the parent of the given `Legolas.Schema`. -""" -@inline schema_parent(::Type{<:Schema}) = nothing -@inline schema_parent(schema::Schema) = schema_parent(typeof(schema)) - -Base.show(io::IO, schema::Schema) = print(io, "Schema(\"$(schema_name(schema))@$(schema_version(schema))\")") - -##### -##### methods overloaded by `@row` -##### - -struct UnknownSchemaError <: Exception - schema::Legolas.Schema -end - -function Base.showerror(io::IO, e::UnknownSchemaError) - print(io, """ - encountered unknown `Legolas.Schema` type: $(e.schema) - - This generally indicates that this schema has not been defined (i.e. - the schema's corresponding `@row` statement has not been executed) in - the current Julia session. - - In practice, this can arise if you try to read a Legolas table with a - prescribed schema, but haven't actually loaded the schema definition - (or commonly, haven't loaded the dependency that contains the schema - definition - check the versions of loaded packages/modules to confirm - your environment is as expected). - - Note that if you're in this particular situation, you can still load - the raw table as-is without Legolas; e.g., to load an Arrow table, call `Arrow.Table(path)`. - """) - return nothing -end - -# Note that there exist very clean generic implementations of `transform`/`validate`: -# -# function transform(schema::Schema; fields...) 
-# parent = schema_parent(schema) -# parent isa Schema && (fields = transform(parent; fields...)) -# return _transform(schema; fields...) -# end -# -# function validate(tables_schema::Tables.Schema, legolas_schema::Schema) -# parent = schema_parent(legolas_schema) -# parent isa Schema && validate(parent, tables_schema) -# _validate(tables_schema, legolas_schema) -# return nothing -# end -# -# However, basic benchmarking demonstrates that the above versions can allocate -# unnecessarily for schemas with a few ancestors, while the "hardcoded" versions -# generated by the current implementation of the `@row` macro (see below) do not. - -transform(s::Legolas.Schema; fields...) = throw(UnknownSchemaError(s)) - -function _transform end - -""" - Legolas.validate(tables_schema::Tables.Schema, legolas_schema::Legolas.Schema) - -Throws an `ArgumentError` if `tables_schema` does not comply with `legolas_schema`, otherwise -returns `nothing`. - -Specifically, `tables_schema` is considered to comply with `legolas_schema` if: - -- every non-`>:Missing` field required by `legolas_schema` is present in `tables_schema`. -- `T <: S` for each field `f::T` in `tables_schema` that matches a required `legolas_schema` field `f::S`. -""" -validate(::Tables.Schema, s::Legolas.Schema) = throw(UnknownSchemaError(s)) - -function _validate end - -function validate_expected_field(schema::Tables.Schema, name::Symbol, ::Type{T}) where {T} - i = findfirst(==(name), schema.names) - if isnothing(i) - Missing <: T || throw(ArgumentError("could not find expected field `$name::$T` in $schema")) - else - schema.types[i] <: T || throw(ArgumentError("field `$name` has unexpected type; expected <:$T, found $(schema.types[i])")) - end - return nothing -end - -""" - schema_qualified_string(::Legolas.Schema{name,version}) - -Return this `Legolas.Schema`'s fully qualified schema identifier string. 
This string is -serialized as the `\"$LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY\"` field value in table -metadata for table written via [`Legolas.write`](@ref). -""" -schema_qualified_string(s::Legolas.Schema) = throw(UnknownSchemaError(s)) - -##### -##### Row -##### - -""" - Legolas.Row(schema::Schema; fields...) - Legolas.Row(schema::Schema, row) - -Return a `Legolas.Row <: Tables.AbstractRow` instance whose fields are the provided `fields` -(or the fields of `row`) validated/transformed in accordance with provided `schema`. - -For more details and examples, please see `Legolas.jl/examples/tour.jl`. -""" -struct Row{S<:Schema,F} <: Tables.AbstractRow - schema::S - fields::F - function Row(schema::Schema; fields...) - fields = transform(schema; fields...) - return new{typeof(schema),typeof(fields)}(schema, fields) - end -end - -Row{S}(args...; kwargs...) where {S} = Row(S(), args...; kwargs...) - -Row(schema::Schema, fields) = Row(schema, NamedTuple(Tables.Row(fields))) -Row(schema::Schema, fields::Row) = Row(schema, getfield(fields, :fields)) -Row(schema::Schema, fields::NamedTuple) = Row(schema; fields...) 
- -Base.propertynames(row::Row) = propertynames(getfield(row, :fields)) -Base.getproperty(row::Row, name::Symbol) = getproperty(getfield(row, :fields), name) - -Tables.getcolumn(row::Row, i::Int) = Tables.getcolumn(getfield(row, :fields), i) -Tables.getcolumn(row::Row, nm::Symbol) = Tables.getcolumn(getfield(row, :fields), nm) -Tables.columnnames(row::Row) = Tables.columnnames(getfield(row, :fields)) - -Base.:(==)(a::Row, b::Row) = getfield(a, :schema) == getfield(b, :schema) && getfield(a, :fields) == getfield(b, :fields) -Base.isequal(a::Row, b::Row) = isequal(getfield(a, :schema), getfield(b, :schema)) && isequal(getfield(a, :fields), getfield(b, :fields)) -Base.hash(a::Row, h::UInt) = hash(Row, hash(getfield(a, :schema), hash(getfield(a, :fields), h))) - -function Base.show(io::IO, row::Row) - print(io, "Row($(getfield(row, :schema)), ") - show(io, getfield(row, :fields)) - print(io, ")") - return nothing -end - -function _parse_schema_expr(x) - if x isa Expr && x.head == :call && x.args[1] == :> && length(x.args) == 3 - child, _ = _parse_schema_expr(x.args[2]) - parent, _ = _parse_schema_expr(x.args[3]) - return child, parent - end - return nothing, nothing -end - -_parse_schema_expr(str::AbstractString) = Schema(str), nothing - -""" - @row("name@version", field_expressions...) - @row("name@version" > "parent_name@parent_version", field_expressions...) - -Define a new `Legolas.Schema{name,version}` whose required fields are specified by `field_expressions`. -Returns `Legolas.Row{Legolas.Schema{name,version}}` which can be conveniently aliased to the caller's -preferred binding for a row constructor associated with `Legolas.Schema{name,version}`. - -Each element of `field_expression` defines a required field for `Legolas.Schema{name,version}`, and is -an expression of the form `field::F = rhs` where: - -- `field` is the corresponding field's name -- `::F` denotes the field's type constraint (if elided, defaults to `::Any`). 
-- `rhs` is the expression which produces `field::F` (if elided, defaults to `field`). - -As implied above, the following alternative forms are also allowed: - -- `field::F` (interpreted as `field::F = field`) -- `field = rhs` (interpreted as `field::Any = rhs`) -- `field` (interpreted as `field::Any = field`) - -For more details and examples, please see `Legolas.jl/examples/tour.jl` and the "Tips for Schema Authors" -section of the Legolas.jl documentation. -""" -macro row(schema_expr, fields...) - schema, parent = _parse_schema_expr(schema_expr) - isnothing(schema) && throw(ArgumentError("`@row` schema argument must be of the form `\"name@X\"` or `\"name@X\" > \"parent@Y\"`. Received: $schema_expr")) - fields = map(fields) do f - original_f = f - f isa Symbol && (f = Expr(:(::), f, :Any)) - f.head == :(::) && (f = Expr(:(=), f, f.args[1])) - f.head == :(=) && f.args[1] isa Symbol && (f.args[1] = Expr(:(::), f.args[1], :Any)) - f.head == :(=) && f.args[1].head == :(::) || throw(ArgumentError("malformed `@row` field expression: $original_f")) - return f - end - validate_fields = map(fields) do f - name, type = f.args[1].args - return :(validate_expected_field(tables_schema, $(Base.Meta.quot(name)), $(esc(type)))) - end - field_names = [esc(f.args[1].args[1]) for f in fields] - schema_type = Base.Meta.quot(typeof(schema)) - quoted_parent = Base.Meta.quot(parent) - schema_qualified_string = string(schema_name(schema), '@', schema_version(schema)) - parent_transform = nothing - parent_validate = nothing - if !isnothing(parent) - schema_qualified_string = :(string($schema_qualified_string, '>', Legolas.schema_qualified_string($quoted_parent))) - parent_transform = :(fields = transform($quoted_parent; fields...)) - parent_validate = :(validate(tables_schema, $quoted_parent)) - end - - legolas_row_arrow_name = :(Symbol("JuliaLang.", $schema_qualified_string)) - return quote - Legolas.schema_qualified_string(::$schema_type) = $schema_qualified_string - - 
Legolas.schema_parent(::Type{<:$schema_type}) = $quoted_parent - - function Legolas._transform(::$schema_type; $([Expr(:kw, f, :missing) for f in field_names]...), other...) - $(map(esc, fields)...) - return (; $([Expr(:kw, f, f) for f in field_names]...), other...) - end - - function Legolas._validate(tables_schema::Tables.Schema, legolas_schema::$schema_type) - $(validate_fields...) - return nothing - end - - function Legolas.transform(schema::$schema_type; fields...) - $parent_transform - return _transform(schema; fields...) - end - - function Legolas.validate(tables_schema::Tables.Schema, legolas_schema::$schema_type) - $parent_validate - return _validate(tables_schema, legolas_schema) - end - - - # Support (de)serialization as an Arrow column value via Arrow.ArrowTypes overloads. - # - # Note that this only really works in relatively simple cases; rely on this at your own peril. - # See https://github.com/JuliaData/Arrow.jl/issues/230 for more details. - # - # Note also that the limited support here that DOES work participates in SemVer, - # e.g. if we break this in future Legolas versions we should treat it as a breaking - # change and bump version numbers accordingly. - - # We serialize as a triple of schema name, schema version, and fields. - # This is for backwards compatibility. With this approach, defining methods per-Row type, - # we could just serialize the fields alone. - # This approach allows nested arrow serialization to work, ref . 
- Arrow.ArrowTypes.arrowname(::Type{<:Legolas.Row{$schema_type}}) = $legolas_row_arrow_name - Arrow.ArrowTypes.ArrowType(::Type{Legolas.Row{$schema_type,F}}) where {F} = Tuple{String,Int,F} - Arrow.ArrowTypes.toarrow(row::Legolas.Row{$schema_type}) = (String(Legolas.schema_name($schema_type)), Legolas.schema_version($schema_type), getfield(row, :fields)) - Arrow.ArrowTypes.JuliaType(::Val{$legolas_row_arrow_name}, ::Any) = Legolas.Row{$schema_type} - Arrow.ArrowTypes.fromarrow(::Type{<:Legolas.Row{$schema_type}}, name, version, fields) = Legolas.Row{$schema_type}(fields) - - - Legolas.Row{$schema_type} - end -end - -# More Arrow serialization: here we provide backwards compatibility for `JuliaLang.Legolas.Row` -# serialized tables. -const LEGOLAS_ROW_ARROW_NAME = Symbol("JuliaLang.Legolas.Row") -Arrow.ArrowTypes.arrowname(::Type{<:Legolas.Row}) = LEGOLAS_ROW_ARROW_NAME -Arrow.ArrowTypes.ArrowType(::Type{Legolas.Row{_,F}}) where {_,F} = Tuple{String,Int,F} -Arrow.ArrowTypes.toarrow(row::Legolas.Row{S}) where {S} = (String(Legolas.schema_name(S)), Legolas.schema_version(S), getfield(row, :fields)) -Arrow.ArrowTypes.JuliaType(::Val{LEGOLAS_ROW_ARROW_NAME}, ::Any) = Legolas.Row -Arrow.ArrowTypes.fromarrow(::Type{<:Legolas.Row}, name, version, fields) = Legolas.Row(Legolas.Schema(name, version), fields) diff --git a/src/schemas.jl b/src/schemas.jl new file mode 100644 index 0000000..679b0a0 --- /dev/null +++ b/src/schemas.jl @@ -0,0 +1,694 @@ +##### +##### schema name/identifier parsing/validation +##### + +const ALLOWED_SCHEMA_NAME_CHARACTERS = Char['-', '.', 'a':'z'..., '0':'9'...] + +""" + Legolas.is_valid_schema_name(x::AbstractString) + +Return `true` if `x` is a valid schema name, return `false` otherwise. + +Valid schema names are lowercase, alphanumeric, and may contain hyphens or periods. 
+""" +is_valid_schema_name(x::AbstractString) = all(i -> i in ALLOWED_SCHEMA_NAME_CHARACTERS, x) + +##### +##### `SchemaVersion` +##### + +""" + Legolas.SchemaVersion{name,version} + +A type representing a particular version of Legolas schema. The relevant `name` (a `Symbol`) +and `version` (an `Integer`) are surfaced as type parameters, allowing them to be utilized for +dispatch. + +For more details and examples, please see `Legolas.jl/examples/tour.jl` and the +"Schema-Related Concepts/Conventions" section of the Legolas.jl documentation. + +The constructor `SchemaVersion{name,version}()` will throw an `ArgumentError` if `version` is +negative. + +See also: [`Legolas.@schema`](@ref) +""" +struct SchemaVersion{n,v} + function SchemaVersion{n,v}() where {n,v} + v isa Integer && v >= 0 || throw(ArgumentError("`version` in `SchemaVersion{_,version}` must be a non-negative integer, received: `($v)::$(typeof(v))`")) + return new{n,v}() + end +end + +""" + Legolas.SchemaVersion(name::AbstractString, version::Integer) + +Return `Legolas.SchemaVersion{Symbol(name),version}()`. + +Throws an `ArgumentError` if `name` is not a valid schema name. + +Prefer using this constructor over `Legolas.SchemaVersion{Symbol(name),version}()` directly. +""" +function SchemaVersion(n::AbstractString, v::Integer) + is_valid_schema_name(n) || throw(ArgumentError("argument is not a valid `Legolas.SchemaVersion` name: \"$n\"")) + return SchemaVersion{Symbol(n),v}() +end + +SchemaVersion(sv::SchemaVersion) = sv + +##### +##### `parse_identifier` +##### + +""" + Legolas.parse_identifier(id::AbstractString) + +Given a valid schema version identifier `id` of the form: + + \$(names[1])@\$(versions[1]) > \$(names[2])@\$(versions[2]) > ... > \$(names[n])@\$(versions[n]) + +return an `n` element `Vector{SchemaVersion}` whose `i`th element is `SchemaVersion(names[i], versions[i])`. + +Throws an `ArgumentError` if the provided string is not a valid schema version identifier. 
+ +For details regarding valid schema version identifiers and their structure, see the +"Schema-Related Concepts/Conventions" section of the Legolas.jl documentation. +""" +function parse_identifier(id::AbstractString) + name_and_version_per_schema = [split(strip(x), '@') for x in split(id, '>')] + results = SchemaVersion[] + invalid = isempty(name_and_version_per_schema) + if !invalid + for nv in name_and_version_per_schema + if length(nv) != 2 + invalid = true + break + end + n, v = nv + v = tryparse(Int, v) + v isa Int ? push!(results, SchemaVersion(n, v)) : (invalid = true) # a non-integer version invalidates the whole identifier instead of being silently dropped + end + end + (invalid || isempty(results)) && throw(ArgumentError("failed to parse seemingly invalid/malformed schema version identifier string: \"$id\"")) + return results +end + +##### +##### `UnknownSchemaVersionError` +##### + +struct UnknownSchemaVersionError <: Exception + schema_version::SchemaVersion +end + +function Base.showerror(io::IO, e::UnknownSchemaVersionError) + print(io, """ + UnknownSchemaVersionError: encountered unknown Legolas schema version: + + name=\"$(name(e.schema_version))\" + version=$(version(e.schema_version)) + + This generally indicates that this schema has not been declared (i.e. + the corresponding `@schema` and/or `@version` statements have not been + executed) in the current Julia session. + + In practice, this can arise if you try to read a Legolas table with a + prescribed schema, but haven't actually loaded the schema definition + (or commonly, haven't loaded the dependency that contains the schema + definition - check the versions of loaded packages/modules to confirm + your environment is as expected). + + Note that if you're in this particular situation, you can still load the raw + table as-is without Legolas (e.g. via `Arrow.Table(path_to_table)`). + """) + return nothing +end + +##### +##### `SchemaVersion` accessors +##### + +""" + Legolas.name(::Legolas.SchemaVersion{n}) + +Return `n`. 
+""" +@inline name(::SchemaVersion{n}) where {n} = n + +""" + Legolas.version(::Legolas.SchemaVersion{n,v}) + +Return `v`. +""" +@inline version(::SchemaVersion{n,v}) where {n,v} = v + +""" + Legolas.parent(sv::Legolas.SchemaVersion) + +Return the `Legolas.SchemaVersion` instance that corresponds to `sv`'s declared parent, or `nothing` if `sv` has no declared parent. +""" +@inline parent(::SchemaVersion) = nothing + +""" + Legolas.declared(sv::Legolas.SchemaVersion{name,version}) + +Return `true` if the schema version `name@version` has been declared via `@version` in the current Julia +session; return `false` otherwise. +""" +@inline declared(::SchemaVersion) = false + +""" + Legolas.identifier(::Legolas.SchemaVersion) + +Return this `Legolas.SchemaVersion`'s fully qualified schema version identifier. This string is serialized +as the `\"$LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY\"` field value in table metadata for tables +written via [`Legolas.write`](@ref). +""" +identifier(sv::SchemaVersion) = throw(UnknownSchemaVersionError(sv)) + +""" + Legolas.required_fields(sv::Legolas.SchemaVersion) + +Return a `NamedTuple{...,Tuple{Vararg{DataType}}}` whose fields take the form: + + required_field_name = required_field_type + +If `sv` has a parent, the returned fields will include `required_fields(parent(sv))`. 
+""" +required_fields(sv::SchemaVersion) = throw(UnknownSchemaVersionError(sv)) + +""" + Legolas.declaration(sv::Legolas.SchemaVersion) + +Return a `Pair{String,Vector{Legolas.RequiredFieldInfo}}` of the form + + schema_version_identifier::String => required_field_infos::Vector{Legolas.RequiredFieldInfo} + +where `RequiredFieldInfo` has the fields: + +- `name::Symbol`: the required field's name +- `type::Union{Symbol,Expr}`: the required field's declared type constraint +- `parameterize::Bool`: whether or not the required field is exposed as a parameter +- `statement::Expr`: the required field's full assignment statement (as processed by `@version`, not necessarily as written) + +Note that `declaration` is primarily intended to be used for interactive discovery purposes, and +does not include the contents of `declaration(parent(sv))`. +""" +declaration(sv::SchemaVersion) = throw(UnknownSchemaVersionError(sv)) + +##### +##### `SchemaVersion` printing +##### + +Base.show(io::IO, sv::SchemaVersion) = print(io, "SchemaVersion(\"$(name(sv))\", $(version(sv)))") + +##### +##### `SchemaVersion` Arrow (de)serialization +##### + +const LEGOLAS_SCHEMA_VERSION_ARROW_NAME = Symbol("JuliaLang.Legolas.SchemaVersion") +Arrow.ArrowTypes.arrowname(::Type{<:SchemaVersion}) = LEGOLAS_SCHEMA_VERSION_ARROW_NAME +Arrow.ArrowTypes.ArrowType(::Type{<:SchemaVersion}) = String +Arrow.ArrowTypes.toarrow(sv::SchemaVersion) = identifier(sv) +Arrow.ArrowTypes.JuliaType(::Val{LEGOLAS_SCHEMA_VERSION_ARROW_NAME}, ::Any) = SchemaVersion +Arrow.ArrowTypes.fromarrow(::Type{<:SchemaVersion}, id) = first(parse_identifier(id)) + +##### +##### `Tables.Schema` validation +##### + +@inline accepted_field_type(::SchemaVersion, T) = T +accepted_field_type(::SchemaVersion, ::Type{UUID}) = Union{UUID,UInt128} +accepted_field_type(::SchemaVersion, ::Type{Symbol}) = Union{Symbol,String} + +""" + Legolas.find_violation(ts::Tables.Schema, sv::Legolas.SchemaVersion) + +For each required field `f::F` of `sv`: + +- Define `A = 
Legolas.accepted_field_type(sv, F)` +- If `f::T` is present in `ts`, ensure that `T <: A` or else immediately return `f::Symbol => T::DataType`. +- If `f` isn't present in `ts`, ensure that `Missing <: A` or else immediately return `f::Symbol => missing::Missing`. + +Otherwise, return `nothing`. + +See also: [`Legolas.validate`](@ref), [`Legolas.complies_with`](@ref) +""" +find_violation(::Tables.Schema, sv::SchemaVersion) = throw(UnknownSchemaVersionError(sv)) + +function _find_violation end + +""" + Legolas.validate(ts::Tables.Schema, sv::Legolas.SchemaVersion) + +Throws a descriptive `ArgumentError` if `!isnothing(find_violation(ts, sv))`, +otherwise return `nothing`. + +See also: [`Legolas.find_violation`](@ref), [`Legolas.complies_with`](@ref) +""" +function validate(ts::Tables.Schema, sv::SchemaVersion) + result = find_violation(ts, sv) + isnothing(result) && return nothing + field, violation = result + ismissing(violation) && throw(ArgumentError("could not find expected field `$field` in $ts")) + expected = getfield(required_fields(sv), field) + throw(ArgumentError("field `$field` has unexpected type; expected <:$expected, found $violation")) +end + +""" + Legolas.complies_with(ts::Tables.Schema, sv::Legolas.SchemaVersion) + +Return `isnothing(find_violation(ts, sv))`. 
+ +See also: [`Legolas.find_violation`](@ref), [`Legolas.validate`](@ref) +""" +complies_with(ts::Tables.Schema, sv::SchemaVersion) = isnothing(find_violation(ts, sv)) + +##### +##### `AbstractRecord` +##### + +abstract type AbstractRecord <: Tables.AbstractRow end + +@inline Tables.getcolumn(r::AbstractRecord, i::Int) = getfield(r, i) +@inline Tables.getcolumn(r::AbstractRecord, nm::Symbol) = getfield(r, nm) +@inline Tables.columnnames(r::AbstractRecord) = fieldnames(typeof(r)) +@inline Tables.schema(::AbstractVector{R}) where {R<:AbstractRecord} = Tables.Schema(fieldnames(R), fieldtypes(R)) + +##### +##### `@schema` +##### + +schema_name_from_prefix(::Val) = nothing + +""" + @schema "name" Prefix + +Declare a Legolas schema with the given `name`. Types generated by subsequent +[`@version`](@ref) declarations for this schema will be prefixed with `Prefix`. + +For more details and examples, please see `Legolas.jl/examples/tour.jl`. +""" +macro schema(schema_name, schema_prefix) + schema_name isa String || return :(throw(ArgumentError("`name` provided to `@schema` must be a string literal"))) + occursin('@', schema_name) && return :(throw(ArgumentError("`name` provided to `@schema` should not include an `@` version clause"))) + is_valid_schema_name(schema_name) || return :(throw(ArgumentError("`name` provided to `@schema` is not a valid `Legolas.SchemaVersion` name: \"" * $schema_name * "\""))) + schema_prefix isa Symbol || return :(throw(ArgumentError(string("`Prefix` provided to `@schema` is not a valid type name: ", $(Base.Meta.quot(schema_prefix)))))) + return quote + Legolas.schema_name_from_prefix(::Val{$(Base.Meta.quot(schema_prefix))}) = $schema_name + end +end + +##### +##### `@version` +##### + +struct SchemaVersionDeclarationError <: Exception + message::String +end + +SchemaVersionDeclarationError(x, y, args...) 
= SchemaVersionDeclarationError(string(x, y, args...)) + +function Base.showerror(io::IO, e::SchemaVersionDeclarationError) + print(io, """ + SchemaVersionDeclarationError: $(e.message) + + Note that valid `@version` declarations meet these expectations: + + - `@version`'s first argument must be of the form `RecordType` or + `RecordType > ParentRecordType`, where a valid record type name + takes the form \$(Prefix)V\$(n)` where `Prefix` is a symbol registered + for a particular schema via a prior `@schema` declaration and `n` + is a non-negative integer literal. + + - `@version` declarations must list at least one required field, + and must not list duplicate fields within the same declaration. + """) +end + +struct RequiredFieldInfo + name::Symbol + type::Union{Symbol,Expr} + parameterize::Bool + statement::Expr +end + +Base.:(==)(a::RequiredFieldInfo, b::RequiredFieldInfo) = all(getfield(a, i) == getfield(b, i) for i in 1:fieldcount(RequiredFieldInfo)) + +function _parse_required_field_info!(f) + f isa Symbol && (f = Expr(:(::), f, :Any)) + f.head == :(::) && (f = Expr(:(=), f, f.args[1])) + f.head == :(=) && f.args[1] isa Symbol && (f.args[1] = Expr(:(::), f.args[1], :Any)) + f.head == :(=) && f.args[1].head == :(::) || error("couldn't normalize field expression: $f") + type = f.args[1].args[2] + parameterize = false + if type isa Expr && type.head == :(<:) + type = type.args[1] + parameterize = true + end + return RequiredFieldInfo(f.args[1].args[1], type, parameterize, f) +end + +function _has_valid_child_field_types(child_fields::NamedTuple, parent_fields::NamedTuple) + for (name, child_type) in pairs(child_fields) + if haskey(parent_fields, name) + child_type <: parent_fields[name] || return false + end + end + return true +end + +_validate_wrt_parent(::NamedTuple, ::Nothing) = nothing + +function _validate_wrt_parent(child_fields::NamedTuple, parent::SchemaVersion) + declared(parent) || throw(SchemaVersionDeclarationError("parent schema version cannot be 
used before it has been declared: $parent")) + _has_valid_child_field_types(child_fields, required_fields(parent)) || throw(SchemaVersionDeclarationError("declared field types violate parent's field types")) + return nothing +end + +function _check_for_expected_field(schema::Tables.Schema, name::Symbol, ::Type{T}) where {T} + i = findfirst(==(name), schema.names) + if isnothing(i) + Missing <: T || return missing + else + schema.types[i] <: T || return schema.types[i] + end + return nothing +end + +function _generate_validation_definitions(schema_version::SchemaVersion) + field_violation_check_statements = Expr[] + for (fname, ftype) in pairs(required_fields(schema_version)) + fname = Base.Meta.quot(fname) + push!(field_violation_check_statements, quote + S = $Legolas.accepted_field_type(sv, $ftype) + result = $Legolas._check_for_expected_field(ts, $fname, S) + isnothing(result) || return $fname => result + end) + end + return quote + function $(Legolas).find_violation(ts::$(Tables).Schema, sv::$(Base.Meta.quot(typeof(schema_version)))) + $(field_violation_check_statements...) + return nothing + end + end +end + +function _record_type end # overloaded by `@version` + +# Note also that this function's implementation is allowed to "observe" `Legolas.required_fields(parent)` +# (if a parent exists), but is NOT allowed to "observe" `Legolas.declaration(parent)`, since the latter +# includes the parent's declared field RHS statements. We cannot interpolate/incorporate these statements +# in the child's record type definition because they may reference bindings from the parent's `@version` +# callsite that are not available/valid at the child's `@version` callsite. 
+function _generate_record_type_definitions(schema_version::SchemaVersion, record_type_symbol::Symbol) + # generate `schema_version_type_alias_definition` + T = Symbol(string(record_type_symbol, "SchemaVersion")) + schema_version_type_alias_definition = :(const $T = $(Base.Meta.quot(typeof(schema_version)))) + + # generate building blocks for record type definitions + record_fields = required_fields(schema_version) + _, declared_field_infos = declaration(schema_version) + declared_field_infos = Dict(f.name => f for f in declared_field_infos) + type_param_defs = Expr[] + names_of_parameterized_fields = Symbol[] + field_definitions = Expr[] + field_assignments = Expr[] + for (fname, ftype) in pairs(record_fields) + fdef = :($fname::$(Base.Meta.quot(ftype))) + info = get(declared_field_infos, fname, nothing) + if !isnothing(info) + fstmt = info.statement + if info.parameterize + T = Symbol("_", string(fname, "_T")) + push!(type_param_defs, :($T <: $(info.type))) + push!(names_of_parameterized_fields, fname) + fdef = :($fname::$T) + fstmt = :($fname = $(fstmt.args[2])) + end + push!(field_assignments, fstmt) + end + push!(field_definitions, fdef) + end + + # generate `parent_record_application` + field_kwargs = [Expr(:kw, n, :missing) for n in keys(record_fields)] + parent_record_application = nothing + parent = Legolas.parent(schema_version) + if !isnothing(parent) + p = gensym() + P = Base.Meta.quot(_record_type(parent)) + parent_record_field_names = keys(required_fields(parent)) + parent_record_application = quote + $p = $P(; $(parent_record_field_names...)) + $((:($n = $p.$n) for n in parent_record_field_names)...) 
+ end + end + + # generate `inner_constructor_definitions` and `outer_constructor_definitions` + R = record_type_symbol + kwargs_from_row = [Expr(:kw, n, :(get(row, $(Base.Meta.quot(n)), missing))) for n in keys(record_fields)] + outer_constructor_definitions = :($R(row) = $R(; $(kwargs_from_row...))) + if isempty(type_param_defs) + inner_constructor_definitions = quote + function $R(; $(field_kwargs...)) + $parent_record_application + $(field_assignments...) + return new($(keys(record_fields)...)) + end + end + else + type_param_names = [p.args[1] for p in type_param_defs] + inner_constructor_definitions = quote + function $R{$(type_param_names...)}(; $(field_kwargs...)) where {$(type_param_names...)} + $parent_record_application + $(field_assignments...) + return new{$(type_param_names...)}($(keys(record_fields)...)) + end + function $R(; $(field_kwargs...)) + $parent_record_application + $(field_assignments...) + return new{$((:(typeof($n)) for n in names_of_parameterized_fields)...)}($(keys(record_fields)...)) + end + end + outer_constructor_definitions = quote + $outer_constructor_definitions + $R{$(type_param_names...)}(row) where {$(type_param_names...)} = $R{$(type_param_names...)}(; $(kwargs_from_row...)) + end + end + + # generate `base_overload_definitions` + equal_rhs_statement = foldr((x, y) -> :($x && $y), (:(a.$f == b.$f) for f in keys(record_fields))) + isequal_rhs_statement = foldr((x, y) -> :($x && $y), (:(isequal(a.$f, b.$f)) for f in keys(record_fields))) + hash_rhs_statement = foldr((x, y) -> :(hash($x, $y)), (:(r.$f) for f in keys(record_fields)); init=:h) + base_overload_definitions = quote + Base.:(==)(a::$R, b::$R) = $equal_rhs_statement + Base.isequal(a::$R, b::$R) = $isequal_rhs_statement + Base.hash(r::$R, h::UInt) = hash($R, $hash_rhs_statement) + Base.NamedTuple(r::$R) = (; $((:(r.$f) for f in keys(record_fields))...)) + end + + # generate `arrow_overload_definitions` + record_type_arrow_name = 
Base.Meta.quot(Symbol("JuliaLang.Legolas.Generated.$R")) + arrow_overload_definitions = quote + $Arrow.ArrowTypes.arrowname(::Type{<:$R}) = $record_type_arrow_name + $Arrow.ArrowTypes.ArrowType(::Type{R}) where {R<:$R} = NamedTuple{fieldnames(R),Tuple{fieldtypes(R)...}} + $Arrow.ArrowTypes.toarrow(r::$R) = NamedTuple(r) + $Arrow.ArrowTypes.JuliaType(::Val{$record_type_arrow_name}, ::Any) = $R + $Arrow.ArrowTypes.fromarrow(::Type{<:$R}, $(keys(record_fields)...)) = $R(; $(keys(record_fields)...)) + end + + return quote + $schema_version_type_alias_definition + struct $R{$(type_param_defs...)} <: $Legolas.AbstractRecord + $(field_definitions...) + $inner_constructor_definitions + end + $outer_constructor_definitions + $base_overload_definitions + $arrow_overload_definitions + end +end + +function _parse_record_type_symbol(t::Symbol) + pv = rsplit(string(t), 'V'; limit=2) # split at the LAST 'V' only, so that a schema prefix may itself contain 'V' (e.g. `MyVectorV1`) + if length(pv) == 2 + p, v = pv + p = Symbol(p) + v = tryparse(Int, v) + if v isa Int + n = schema_name_from_prefix(Val(p)) + n isa String || return SchemaVersionDeclarationError("provided record type symbol references undeclared schema: ", t) + return n, p, v + end + end + return SchemaVersionDeclarationError("provided record type symbol is malformed: ", t) +end + + +""" + @version RecordType begin + required_field_expression_1 + required_field_expression_2 + ⋮ + end + + @version RecordType > ParentRecordType begin + required_field_expression_1 + required_field_expression_2 + ⋮ + end + +Given a prior `@schema` declaration of the form: + + @schema "example.name" Name + +...the `n`th version of `example.name` can be declared via a `@version` declaration of the form: + + @version NameV\$(n) begin + required_field_expression_1 + required_field_expression_2 + ⋮ + end + +...which generates type definitions for the `NameV\$(n)` type (a `Legolas.AbstractRecord` subtype) and +`NameV\$(n)SchemaVersion` type (an alias of `typeof(SchemaVersion("example.name", n))`), as well as the +necessary definitions to overload 
relevant Legolas methods with specialized behaviors in accordance with +the declared required fields. + +If the declared schema version has a parent, it should be specified via the optional `> ParentRecordType` +clause. + +Each `required_field_expression` specifies a required field of the declared schema version, and is an +expression of the form `field::F = rhs` where: + +- `field` is the corresponding field's name +- `::F` denotes the field's type constraint (if elided, defaults to `::Any`). +- `rhs` is the expression which produces `field::F` (if elided, defaults to `field`). + +Accounting for all of the aforementioned allowed elisions, valid `required_field_expression`s include: + +- `field::F = rhs` +- `field::F` (interpreted as `field::F = field`) +- `field = rhs` (interpreted as `field::Any = rhs`) +- `field` (interpreted as `field::Any = field`) + +`F` is generally a type literal, but may also be an expression of the form `(<:T)`, in which case +the declared schema version's generated record type will expose a type parameter (constrained to be +a subtype of `T`) for the given field. For example: + + julia> @schema "example.foo" Foo + + julia> @version FooV1 begin + x::Int + y::(<:Real) + end + + julia> FooV1(x=1, y=2.0) + FooV1{Float64}: (x = 1, y = 2.0) + + julia> FooV1{Float32}(x=1, y=2) + FooV1{Float32}: (x = 1, y = 2.0f0) + + julia> FooV1(x=1, y="bad") + ERROR: TypeError: in FooV1, in _y_T, expected _y_T<:Real, got Type{String} + +This macro will throw a `Legolas.SchemaVersionDeclarationError` if: + +- The provided `RecordType` does not follow the `\$(Prefix)V\$(n)` format, where `Prefix` was + previously associated with a given schema by a prior `@schema` declaration. +- There are no required field expressions, duplicate required fields are declared, a given + required field expression is invalid. 
+- (if a parent is specified) The `@version` declaration does not comply with its parent's + `@version` declaration, or the parent hasn't yet been declared at all. + +Note that this macro expects to be evaluated within top-level scope. + +For more details and examples, please see `Legolas.jl/examples/tour.jl` and the +"Schema-Related Concepts/Conventions" section of the Legolas.jl documentation. +""" +macro version(record_type, required_field_statements) + if record_type isa Symbol + parent_record_type = nothing + elseif record_type isa Expr && record_type.head == :call && length(record_type.args) == 3 && + record_type.args[1] == :> && + record_type.args[2] isa Symbol && + record_type.args[3] isa Symbol + parent_record_type = record_type.args[3] + record_type = record_type.args[2] + else + return :(throw(SchemaVersionDeclarationError("provided record type expression is malformed: ", $(Base.Meta.quot(record_type))))) + end + + x = _parse_record_type_symbol(record_type) + x isa SchemaVersionDeclarationError && return :(throw($x)) + schema_name, _, schema_integer = x + schema_version = SchemaVersion(schema_name, schema_integer) + quoted_schema_version = Base.Meta.quot(schema_version) + quoted_schema_version_type = Base.Meta.quot(typeof(schema_version)) + parent = nothing + if !isnothing(parent_record_type) + x = _parse_record_type_symbol(parent_record_type) + x isa SchemaVersionDeclarationError && return :(throw($x)) + parent_name, _, parent_integer = x + parent_name == schema_name && return :(throw(SchemaVersionDeclarationError("cannot extend from a different version of the same schema"))) + parent = SchemaVersion(parent_name, parent_integer) + end + quoted_parent = Base.Meta.quot(parent) + + # parse `required_field_statements` + if !(required_field_statements isa Expr && required_field_statements.head == :block && !isempty(required_field_statements.args)) + return :(throw(SchemaVersionDeclarationError("malformed or missing declaration of required fields"))) + end + 
required_field_statements = [f for f in required_field_statements.args if !(f isa LineNumberNode)] + required_field_infos = RequiredFieldInfo[] + for stmt in required_field_statements + original_stmt = Base.Meta.quot(deepcopy(stmt)) + try + push!(required_field_infos, _parse_required_field_info!(stmt)) + catch + return :(throw(SchemaVersionDeclarationError("malformed `@version` field expression: ", $original_stmt))) + end + end + if !allunique(f.name for f in required_field_infos) + msg = string("cannot have duplicate field names in `@version` declaration; recieved: ", [f.name for f in required_field_infos]) + return :(throw(SchemaVersionDeclarationError($msg))) + end + field_names_types = Expr(:tuple, (:($(f.name) = $(esc(f.type))) for f in required_field_infos)...) + + # basic accessor function definitions + full_identifier_string = string(schema_name, '@', schema_integer) + child_identifier_string = full_identifier_string + required_field_names_types = field_names_types + if !isnothing(parent) + full_identifier_string = :(string($full_identifier_string, '>', Legolas.identifier($quoted_parent))) + child_identifier_string = string(child_identifier_string, '>', name(parent), '@', version(parent)) + required_field_names_types = :(merge(Legolas.required_fields($quoted_parent), $required_field_names_types)) + end + schema_version_declaration = :($child_identifier_string => copy($(Base.Meta.quot(required_field_infos)))) + check_against_declaration = :($child_identifier_string => $(Base.Meta.quot(required_field_infos))) + + return quote + if Legolas.declared($quoted_schema_version) && Legolas.declaration($quoted_schema_version) != $check_against_declaration + throw(SchemaVersionDeclarationError("invalid redeclaration of existing schema version; all `@version` redeclarations must exactly match previous declarations")) + else + Legolas._validate_wrt_parent($field_names_types, $quoted_parent) + + @inline Legolas.declared(::$quoted_schema_version_type) = true + + @inline 
Legolas.identifier(::$quoted_schema_version_type) = $full_identifier_string + + @inline Legolas.parent(::$quoted_schema_version_type) = $quoted_parent + + Legolas.required_fields(::$quoted_schema_version_type) = $required_field_names_types + + Legolas.declaration(::$quoted_schema_version_type) = $schema_version_declaration + + $(esc(:eval))(Legolas._generate_validation_definitions($quoted_schema_version)) + + $(esc(:eval))(Legolas._generate_record_type_definitions($quoted_schema_version, $(Base.Meta.quot(record_type)))) + + Legolas._record_type(::$quoted_schema_version_type) = $(esc(record_type)) + end + nothing + end +end diff --git a/src/tables.jl b/src/tables.jl index 126405d..22aee02 100644 --- a/src/tables.jl +++ b/src/tables.jl @@ -1,3 +1,7 @@ +##### +##### Tables.jl operations/utilities +##### + function _columns(table) try return Tables.columns(table) @@ -7,153 +11,6 @@ function _columns(table) end end -##### -##### validate tables -##### - -""" - Legolas.extract_schema(table) - -Attempt to extract Arrow metadata from `table` via `Arrow.getmetadata(table)`. - -If Arrow metadata is present and contains `\"$LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY\" => s`, return [`Legolas.Schema(s)`](@ref). - -Otherwise, return `nothing`. -""" -function extract_schema(table) - metadata = Arrow.getmetadata(table) - if !isnothing(metadata) - for (k, v) in metadata - k == LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY && return Schema(v) - end - end - return nothing -end - -""" - Legolas.validate(table, legolas_schema::Legolas.Schema) - -Attempt to determine `s::Tables.Schema` from `table` and return `Legolas.validate(s, legolas_schema)`. - -If a `Tables.Schema` cannot be determined, a warning message is logged and `nothing` is returned. 
-""" -function validate(table, legolas_schema::Schema) - columns = _columns(table) - Tables.rowcount(columns) > 0 || return nothing - tables_schema = Tables.schema(columns) - if tables_schema isa Tables.Schema - try - validate(tables_schema, legolas_schema) - catch - @warn "provided table's `Tables.Schema` does not appear to match provided `Legolas.Schema`. Run `[Legolas.Row($legolas_schema, r) for r in Tables.rows(t)]` to try converting the table `t` to a compatible representation." - rethrow() - end - else - @warn "could not determine `Tables.Schema` from provided table; skipping schema validation" - end - return nothing -end - -""" - Legolas.validate(table) - -If [`Legolas.extract_schema(table)`](@ref) returns a valid `Legolas.Schema`, return `Legolas.validate(table, Legolas.extract_schema(table))`. - -Otherwise, if a `Legolas.Schema` isn't found or is invalid, an `ArgumentError` is thrown. -""" -function validate(table) - schema = Legolas.extract_schema(table) - isnothing(schema) && throw(ArgumentError(""" - could not extract valid `Legolas.Schema` from provided Arrow table; - is it missing the expected custom metadata and/or the expected - \"$LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY\" field? - """)) - return validate(table, schema) -end - -##### -##### read/write tables -##### - -""" - Legolas.read(io_or_path; validate::Bool=true) - -Read and return an `Arrow.Table` from `io_or_path`. - -If `validate` is `true`, `Legolas.validate` will be called on the table before it is returned. - -Note that `io_or_path` may be any type that supports `Base.read(io_or_path)::Vector{UInt8}`. -""" -function read(io_or_path; validate::Bool=true) - table = read_arrow(io_or_path) - validate && Legolas.validate(table) - return table -end - -""" - Legolas.write(io_or_path, table, schema::Schema; validate::Bool=true, kwargs...) - -Write `table` to `io_or_path`, inserting the appropriate `$LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY` -field in the written out Arrow metadata. 
- -If `validate` is `true`, `Legolas.validate` will be called on the table before it written out. - -Any other provided `kwargs` are forwarded to an internal invocation of `Arrow.write`. - -Note that `io_or_path` may be any type that supports `Base.write(io_or_path, bytes::Vector{UInt8})`. -""" -function write(io_or_path, table, schema::Schema; validate::Bool=true, - metadata=Arrow.getmetadata(table), kwargs...) - validate && Legolas.validate(table, schema) - schema_metadata = LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY => schema_qualified_string(schema) - if isnothing(metadata) - metadata = (schema_metadata,) - else - metadata = Set(metadata) - push!(metadata, schema_metadata) - end - write_arrow(io_or_path, table; metadata=metadata, kwargs...) - return table -end - -""" - Legolas.tobuffer(args...; kwargs...) - -A convenience function that constructs a fresh `io::IOBuffer`, calls -`Legolas.write(io, args...; kwargs...)`, and returns `seekstart(io)`. - -Analogous to the `Arrow.tobuffer` function. -""" -function tobuffer(args...; kwargs...) - io = IOBuffer() - Legolas.write(io, args...; kwargs...) - seekstart(io) - return io -end - -##### -##### read/write Arrow content to generic path types -##### -# It would be better if Arrow.jl supported a generic API for nonstandard path-like types so that -# we can avoid potential intermediate copies here, but its documentation is explicit that it only -# supports `Union{IO,String}`. -# -# TODO: upstream improvements to Arrow.jl to obviate these? - -write_full_path(path::AbstractString, bytes) = (mkpath(dirname(path)); Base.write(path, bytes)) -write_full_path(path, bytes) = Base.write(path, bytes) - -read_arrow(io_or_path::Union{IO,String,Vector{UInt8}}) = Arrow.Table(io_or_path) -read_arrow(path) = read_arrow(Base.read(path)) - -write_arrow(path::String, table; kwargs...) = Arrow.write(path, table; kwargs...) -write_arrow(io::IO, table; kwargs...) = Arrow.write(io, table; file=get(kwargs, :file, true), kwargs...) 
-write_arrow(path, table; kwargs...) = (io = IOBuffer(); write_arrow(io, table; kwargs...); write_full_path(path, take!(io))) - -##### -##### Tables.jl operations -##### - """ locations(collections::Tuple) @@ -205,7 +62,7 @@ function _iterator_for_column(table, c) end """ - gather(column_name, tables...; extract=((table, idxs) -> view(table, idxs, :))) + Legolas.gather(column_name, tables...; extract=((table, idxs) -> view(table, idxs, :))) Gather rows from `tables` into a unified cross-table index along `column_name`. Returns a `Dict` whose keys are the unique values of `column_name` across `tables`, and whose @@ -229,7 +86,7 @@ function gather(column_name, tables::Vararg{Any,N}; end """ - materialize(table) + Legolas.materialize(table) Return a fully deserialized copy of `table`. @@ -240,3 +97,144 @@ such access costs upfront before repeatedly accessing the table. Note that we intend to eventually migrate this function from Legolas.jl to a more appropriate package. """ materialize(table) = map(collect, Tables.columntable(table)) + +##### +##### read/write Arrow content to generic path types +##### +# It would be better if Arrow.jl supported a generic API for nonstandard path-like types so that +# we can avoid potential intermediate copies here, but its documentation is explicit that it only +# supports `Union{IO,String}`. +# +# TODO: upstream improvements to Arrow.jl to obviate these? + +write_full_path(path::AbstractString, bytes) = (mkpath(dirname(path)); Base.write(path, bytes)) +write_full_path(path, bytes) = Base.write(path, bytes) + +read_arrow(io_or_path::Union{IO,String,Vector{UInt8}}) = Arrow.Table(io_or_path) +read_arrow(path) = read_arrow(Base.read(path)) + +write_arrow(path::String, table; kwargs...) = Arrow.write(path, table; kwargs...) +write_arrow(io::IO, table; kwargs...) = Arrow.write(io, table; file=get(kwargs, :file, true), kwargs...) +write_arrow(path, table; kwargs...) 
= (io = IOBuffer(); write_arrow(io, table; kwargs...); write_full_path(path, take!(io))) + +##### +##### `extract_schema_version` +##### + +""" + Legolas.extract_schema_version(table) + +Attempt to extract Arrow metadata from `table` via `Arrow.getmetadata(table)`. + +If Arrow metadata is present and contains `\"$LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY\" => s`, +return `first(parse_identifier(s))`. + +Otherwise, return `nothing`. +""" +function extract_schema_version(table) + metadata = Arrow.getmetadata(table) + if !isnothing(metadata) + for (k, v) in metadata + k == LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY && return first(parse_identifier(v)) + end + end + return nothing +end + +##### +##### `read`/`write` +##### + +""" + Legolas.read(io_or_path; validate::Bool=true) + +Read and return an `Arrow.Table` from `io_or_path`. + +If `validate` is `true`, `Legolas.read` will attempt to extract a `Legolas.SchemaVersion` from +the deserialized `Arrow.Table`'s metadata and use `Legolas.validate` to verify that the table's +`Tables.Schema` complies with the extracted `Legolas.SchemaVersion` before returning the table. + +Note that `io_or_path` may be any type that supports `Base.read(io_or_path)::Vector{UInt8}`. +""" +function read(io_or_path; validate::Bool=true) + table = read_arrow(io_or_path) + if validate + sv = extract_schema_version(table) + isnothing(sv) && throw(ArgumentError(""" + could not extract valid `Legolas.SchemaVersion` from the `Arrow.Table` read + via `Legolas.read`; is it missing the expected custom metadata and/or the + expected \"$LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY\" field? + """)) + try + Legolas.validate(Tables.schema(table), sv) + catch + @warn """ + The `Tables.Schema` of the `Arrow.Table` read via `Legolas.read(io_or_path)` does not appear to + comply with the `Legolas.SchemaVersion` indicated by the table's metadata (`$sv`). Try invoking + `Legolas.read(io_or_path; validate=false)` to inspect the table. 
+ """ + rethrow() + end + end + return table +end + +""" + Legolas.write(io_or_path, table, sv::SchemaVersion; validate::Bool=true, kwargs...) + +Write `table` to `io_or_path`, inserting the appropriate `$LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY` +field in the written out Arrow metadata. + +If `validate` is `true`, `Legolas.validate(Tables.schema(table), vs)` will be invoked before the +table is written out to `io_or_path`. + +Any other provided `kwargs` are forwarded to an internal invocation of `Arrow.write`. + +Note that `io_or_path` may be any type that supports `Base.write(io_or_path, bytes::Vector{UInt8})`. +""" +function write(io_or_path, table, sv::SchemaVersion; validate::Bool=true, + metadata=Arrow.getmetadata(table), kwargs...) + if validate + table_schema = Tables.schema(table) + if table_schema isa Tables.Schema + try + Legolas.validate(table_schema, sv) + catch + @warn """ + The table provided to `Legolas.write` does not appear to comply with the provided `Legolas.SchemaVersion` + according to `Legolas.validate`. You may attempt to construct a schema-compliant table by executing + `[R(r) for r in Tables.rows(table)]` where `R` is the relevant schema version's record type, or disable + validation-on-write by passing `validate=false` to `Legolas.write`. + """ + rethrow() + end + else + @warn "could not determine `Tables.Schema` from table provided to `Legolas.write`; skipping schema validation" + end + end + schema_metadata = LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY => identifier(sv) + if isnothing(metadata) + metadata = (schema_metadata,) + else + metadata = Set(metadata) + push!(metadata, schema_metadata) + end + write_arrow(io_or_path, table; metadata=metadata, kwargs...) + return table +end + +""" + Legolas.tobuffer(args...; kwargs...) + +A convenience function that constructs a fresh `io::IOBuffer`, calls +`Legolas.write(io, args...; kwargs...)`, and returns `seekstart(io)`. + +Analogous to the `Arrow.tobuffer` function. 
+""" +function tobuffer(args...; kwargs...) + io = IOBuffer() + Legolas.write(io, args...; kwargs...) + seekstart(io) + return io +end + diff --git a/test/runtests.jl b/test/runtests.jl index 7d8ae1c..6cdc23f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,5 @@ -using Legolas, Test, DataFrames, Arrow - -using Legolas: Schema, @row, Row +using Legolas, Test, DataFrames, Arrow, UUIDs +using Legolas: SchemaVersion, @schema, @version, SchemaVersionDeclarationError, RequiredFieldInfo include(joinpath(dirname(@__DIR__), "examples", "tour.jl")) @@ -109,93 +108,278 @@ end @test Legolas._iterator_for_column(a, :x) == dfa.x end -@testset "miscellaneous Legolas/src/tables.jl tests" begin - struct MyPath - x::String +bad_id_message(x) = "failed to parse seemingly invalid/malformed schema version identifier string: \"$x\"" +bad_name_message(x) = "argument is not a valid `Legolas.SchemaVersion` name: \"$x\"" +bad_version_message(x) = "`version` in `SchemaVersion{_,version}` must be a non-negative integer, received: `($x)::$(typeof(x))`" + +@testset "Legolas.parse_identifier and related code" begin + good_schema_names = ("foo", "test.foo", "test.foo-bar", ".-technically-allowed-.") + good_versions = (0, 1, 2, 3) + bad_schema_names = ("has_underscore", "caPitaLs", "has a space", "illegal?chars*") + bad_versions = (-1, -2, -3) + + for n in good_schema_names, v in good_versions + @test Legolas.parse_identifier("$n@$v") == [SchemaVersion(n, v)] + @test SchemaVersion(n, v) == SchemaVersion{Symbol(n),v}() + @test Legolas.name(SchemaVersion(n, v)) == Symbol(n) + @test Legolas.version(SchemaVersion(n, v)) == v end - Base.read(p::MyPath) = Base.read(p.x) - Base.write(p::MyPath, bytes) = Base.write(p.x, bytes) - root = mktempdir() - path = MyPath(joinpath(root, "baz.arrow")) - Baz = @row("baz@1", a, b) - t = [Baz(a=1, b=2), Baz(a=3, b=4)] - Legolas.write(path, t, Schema("baz", 1)) - @test t == Baz.(Tables.rows(Legolas.read(path))) - tbl = Arrow.Table(Legolas.tobuffer(t, 
Schema("baz", 1); metadata=("a" => "b", "c" => "d"))) - @test Set(Arrow.getmetadata(tbl)) == Set((Legolas.LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY => "baz@1", - "a" => "b", "c" => "d")) - struct Foo - meta + for n in good_schema_names, v in bad_versions + @test_throws ArgumentError(bad_version_message(v)) SchemaVersion(n, v) + id = "$n@$v" + @test_throws ArgumentError(bad_version_message(v)) Legolas.parse_identifier(id) end - Legolas.Arrow.getmetadata(foo::Foo) = foo.meta - foo = Foo(Dict("a" => "b", "b" => "b", - Legolas.LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY => "baz@1")) - @test Legolas.Schema("baz", 1) == Legolas.extract_schema(foo) - t = [(a="a", c=1, b="b"), Baz(a=1, b=2)] # not a valid Tables.jl table - @test_throws ErrorException Legolas.validate(t, Schema("baz", 1)) + for n in bad_schema_names, v in Iterators.flatten((bad_versions, good_versions)) + @test_throws ArgumentError(bad_name_message(n)) SchemaVersion(n, v) + id = "$n@$v" + @test_throws ArgumentError(bad_name_message(n)) Legolas.parse_identifier(id) + end - t = Arrow.tobuffer((a=[1, 2], b=[3, 4]); metadata=Dict(Legolas.LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY => "haha@3")) - @test_throws Legolas.UnknownSchemaError Legolas.read(t) -end + for n in good_schema_names, m in good_schema_names + @test Legolas.parse_identifier("$n@3>$m@2") == [SchemaVersion(n, 3), SchemaVersion(m, 2)] + @test Legolas.parse_identifier("$n@1>$m@0>bob@3") == [SchemaVersion(n, 1), SchemaVersion(m, 0), SchemaVersion("bob", 3)] + @test Legolas.parse_identifier("$n@1 >$m@0 > bob@3 ") == [SchemaVersion(n, 1), SchemaVersion(m, 0), SchemaVersion("bob", 3)] + id = "$n@1 >$m @0 > bob@3 " + @test_throws ArgumentError(bad_name_message("$m ")) Legolas.parse_identifier(id) + for bad_id in ("$n>$m@1", + "$n@1>$m", + "$n@>$m@", + "$n>$m@", + "$n@>$m", + "$n>$m") + @test_throws ArgumentError(bad_id_message(bad_id)) Legolas.parse_identifier(bad_id) + end + for bad_separator in ("<", "<:", ":", "=") + id = "$n@1" * bad_separator * "$m@0" + 
@test_throws ArgumentError(bad_id_message(id)) Legolas.parse_identifier(id) + end + end -@testset "miscellaneous Legolas.Schema / Legolas.Row tests" begin - @test_throws ArgumentError("`Legolas.Schema` version must be non-negative, recieved: -1") Schema("good_name", -1) - @test_throws ArgumentError("argument is not a valid `Legolas.Schema` name: \"bad_name?\"") Schema("bad_name?", 1) - @test_throws ArgumentError("argument is not a valid `Legolas.Schema` string: \"bad_name>?@1\"") Schema("bad_name>?@1") + for good in good_schema_names, bad in bad_schema_names + for id in ("$good@3>$bad@2", + "$bad@1>bob@0>$good@3", + "bob@1>$bad@0>$good@3", + "$good@1>bob@0>$bad@3") + @test_throws ArgumentError(bad_name_message(bad)) Legolas.parse_identifier(id) + end + end +end - @row("foo@1", x, y) - @row("bar@1" > "foo@1", z) - @test Legolas.schema_parent(Schema("bar", 1)) == Schema("foo", 1) +@testset "Legolas.@schema" begin + @test_throws ArgumentError("`name` provided to `@schema` must be a string literal") @schema(joe, J) + @test_throws ArgumentError("`name` provided to `@schema` should not include an `@` version clause") @schema("joe@1", J) + @test_throws ArgumentError("`name` provided to `@schema` is not a valid `Legolas.SchemaVersion` name: \"joe?\"") @schema("joe?", J) + @test_throws ArgumentError("`Prefix` provided to `@schema` is not a valid type name: J{Int}") @schema("joo", J{Int}) +end - r = Row(Schema("bar", 1), (x=1, y=2, z=3)) +@schema "test.parent" Parent +@version ParentV1 begin + x::Vector + y::AbstractString +end - @test propertynames(r) == (:z, :x, :y) - @test r === Row(Schema("bar", 1), r) - @test r === Row(Schema("bar", 1); x=1, y=2, z=3) - @test r === Row(Schema("bar", 1), first(Tables.rows(Arrow.Table(Arrow.tobuffer((x=[1],y=[2],z=[3])))))) - @test r[1] === 3 - @test string(r) == "Row(Schema(\"bar@1\"), (z = 3, x = 1, y = 2))" +@schema "test.child" Child +@version ChildV1 > ParentV1 begin + z +end - tbl = Arrow.Table(Arrow.tobuffer((x=[r],))) - @test r 
=== tbl.x[1] +@schema "test.grandchild" Grandchild +@version GrandchildV1 > ChildV1 begin + a::Int32 = round(Int32, a) + y::String = string(y[1:2]) +end - long_row = Row(Schema("bar", 1), (x=1, y=2, z=zeros(100, 100))) - @test length(sprint(show, long_row; context=(:limit => true))) < 200 +@schema "test.nested" Nested +@version NestedV1 begin + gc::GrandchildV1 + k::(<:Any) +end - @test_throws Legolas.UnknownSchemaError Legolas.transform(Legolas.Schema("imadethisup@3"); a = 1, b = 2) - @test_throws Legolas.UnknownSchemaError Legolas.validate(Tables.Schema((:a, :b), (Int, Int)), Legolas.Schema("imadethisup@3")) - @test_throws Legolas.UnknownSchemaError Legolas.schema_qualified_string(Legolas.Schema("imadethisup@3")) +@schema "test.nested-again" NestedAgain +@version NestedAgainV1 begin + n::(<:NestedV1) + h::(<:Any) +end - sch = Schema("bar", 1) - @test Schema(sch) == sch +# This statement will induce an error if field types are not properly escaped, +# since `DataFrame` will be hygeine-passed to `Legolas.DataFrame`, which is undefined +@schema "test.field-type-escape" FieldTypeEscape +@version FieldTypeEscapeV1 begin + x::DataFrame +end - schemas = [Schema("bar", 1), Schema("foo", 1)] - tbl = Arrow.Table(Arrow.tobuffer((; schema=schemas))) - @test all(tbl.schema .== schemas) +@schema "test.accepted" Accepted +@version AcceptedV1 begin + id::UUID + sym::Symbol end -@testset "isequal, hash" begin - TestRow = @row("testrow@1", x, y) +@schema "test.new" New + +@testset "`Legolas.@version` and associated utilities for declared `Legolas.SchemaVersion`s" begin + @testset "Legolas.SchemaVersionDeclarationError" begin + @test_throws SchemaVersionDeclarationError("malformed or missing declaration of required fields") @version(ChildV2, begin end) + @test_throws SchemaVersionDeclarationError("provided record type symbol references undeclared schema: UnknownV1") @version(UnknownV1 > ChildV1, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol 
references undeclared schema: UnknownV1") @version(ChildV1 > UnknownV1, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol is malformed: Child") @version(Child, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol is malformed: Childv2") @version(Childv2, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol is malformed: ChildV") @version(ChildV, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol is malformed: ChildVTwo") @version(ChildVTwo, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol is malformed: Parent") @version(ChildV1 > Parent, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol is malformed: Parentv2") @version(ChildV1 > Parentv2, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol is malformed: ParentV") @version(ChildV1 > ParentV, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type symbol is malformed: ParentVTwo") @version(ChildV1 > ParentVTwo, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type expression is malformed: BobV1 > DaveV1 > JoeV1") @version(BobV1 > DaveV1 > JoeV1, begin x end) + @test_throws SchemaVersionDeclarationError("provided record type expression is malformed: BobV1 < DaveV1") @version(BobV1 < DaveV1, begin x end) + @test_throws SchemaVersionDeclarationError("cannot have duplicate field names in `@version` declaration; recieved: $([:x, :y, :x, :z])") @version(ChildV2, begin x; y; x; z end) + @test_throws SchemaVersionDeclarationError("parent schema version cannot be used before it has been declared: SchemaVersion(\"test.parent\", 2)") @version(ChildV2 > ParentV2, begin x end) + @test_throws SchemaVersionDeclarationError("parent schema version cannot be used before it has been declared: SchemaVersion(\"test.new\", 1)") 
@version(ChildV2 > NewV1, begin y::Int end) + @test_throws SchemaVersionDeclarationError("cannot extend from a different version of the same schema") @version(ChildV2 > ChildV1, begin x end) + @test_throws SchemaVersionDeclarationError("declared field types violate parent's field types") @version(NewV1 > ParentV1, begin y::Int end) + @test_throws SchemaVersionDeclarationError("declared field types violate parent's field types") @version(NewV1 > ChildV1, begin y::Int end) + @test_throws SchemaVersionDeclarationError("invalid redeclaration of existing schema version; all `@version` redeclarations must exactly match previous declarations") @version(ParentV1, begin x; y end) + @test_throws SchemaVersionDeclarationError("malformed `@version` field expression: f()") @version(ChildV2, begin f() end) + end + + undeclared = SchemaVersion("undeclared", 3) + + @testset "Legolas.declared" begin + @test !Legolas.declared(undeclared) + @test all(Legolas.declared, (ParentV1SchemaVersion(), ChildV1SchemaVersion(), GrandchildV1SchemaVersion())) + end + + @testset "Legolas.parent" begin + @test isnothing(Legolas.parent(undeclared)) + @test isnothing(Legolas.parent(ParentV1SchemaVersion())) + @test Legolas.parent(ChildV1SchemaVersion()) == ParentV1SchemaVersion() + @test Legolas.parent(GrandchildV1SchemaVersion()) == ChildV1SchemaVersion() + end + + @testset "Legolas.identifier" begin + @test_throws Legolas.UnknownSchemaVersionError(undeclared) Legolas.identifier(undeclared) + @test Legolas.identifier(ParentV1SchemaVersion()) == "test.parent@1" + @test Legolas.identifier(ChildV1SchemaVersion()) == "test.child@1>test.parent@1" + @test Legolas.identifier(GrandchildV1SchemaVersion()) == "test.grandchild@1>test.child@1>test.parent@1" + end + + @testset "Legolas.required_fields" begin + @test_throws Legolas.UnknownSchemaVersionError(undeclared) Legolas.required_fields(undeclared) + @test Legolas.required_fields(ParentV1SchemaVersion()) == (x=Vector, y=AbstractString) + @test 
Legolas.required_fields(ChildV1SchemaVersion()) == (x=Vector, y=AbstractString, z=Any) + @test Legolas.required_fields(GrandchildV1SchemaVersion()) == (x=Vector, y=String, z=Any, a=Int32) + end + + @testset "Legolas.find_violation + Legolas.complies_with + Legolas.validate" begin + @test_throws Legolas.UnknownSchemaVersionError(undeclared) Legolas.validate(Tables.Schema((:a, :b), (Int, Int)), undeclared) + @test_throws Legolas.UnknownSchemaVersionError(undeclared) Legolas.complies_with(Tables.Schema((:a, :b), (Int, Int)), undeclared) + @test_throws Legolas.UnknownSchemaVersionError(undeclared) Legolas.find_violation(Tables.Schema((:a, :b), (Int, Int)), undeclared) + + # Note that many of the basic properties of `find_violation`/`complies_with`/`validate` + # are unit-tested in `examples/tour.jl`; thus, we focus here on testing that these + # functions work as expected w.r.t. schema extension in particular. + + t = Tables.Schema((:a, :y, :z), (Int32, String, Any)) + for s in (GrandchildV1SchemaVersion(), ChildV1SchemaVersion(), ParentV1SchemaVersion()) + @test_throws ArgumentError("could not find expected field `x` in $t") Legolas.validate(t, s) + @test !Legolas.complies_with(t, s) + @test isequal(Legolas.find_violation(t, s), :x => missing) + end + + t = Tables.Schema((:x, :a, :y), (ComplexF64, Int32, String)) + for s in (GrandchildV1SchemaVersion(), ChildV1SchemaVersion(), ParentV1SchemaVersion()) + @test_throws ArgumentError("field `x` has unexpected type; expected <:$(Vector), found $(Complex{Float64})") Legolas.validate(t, s) + @test !Legolas.complies_with(t, s) + @test isequal(Legolas.find_violation(t, s), :x => ComplexF64) + end + + t = Tables.Schema((:x, :a, :y), (Vector, Int32, String)) + for s in (GrandchildV1SchemaVersion(), ChildV1SchemaVersion(), ParentV1SchemaVersion()) + @test isnothing(Legolas.validate(t, s)) + @test Legolas.complies_with(t, s) + @test isnothing(Legolas.find_violation(t, s)) + end + + for T in (UUID, UInt128), S in (Symbol, String) + 
@test Legolas.complies_with(Tables.Schema((:id, :sym), (T, S)), AcceptedV1SchemaVersion()) + end + end + + @testset "Legolas.declaration" begin + @test_throws Legolas.UnknownSchemaVersionError(undeclared) Legolas.declaration(undeclared) + @test Legolas.declaration(ParentV1SchemaVersion()) == ("test.parent@1" => [RequiredFieldInfo(:x, :Vector, false, :(x::Vector = x)), + RequiredFieldInfo(:y, :AbstractString, false, :(y::AbstractString = y))]) + @test Legolas.declaration(ChildV1SchemaVersion()) == ("test.child@1>test.parent@1" => [RequiredFieldInfo(:z, :Any, false, :(z::Any = z))]) + @test Legolas.declaration(GrandchildV1SchemaVersion()) == ("test.grandchild@1>test.child@1" => [RequiredFieldInfo(:a, :Int32, false, :(a::Int32 = round(Int32, a))), + RequiredFieldInfo(:y, :String, false, :(y::String = string(y[1:2])))]) + end + + r0 = (x=[42], y="foo", z=:three, a=1.3) + r0_arrow = first(Tables.rows(Arrow.Table(Arrow.tobuffer([r0])))) + + @test NamedTuple(ParentV1(r0)) == (; r0.x, r0.y) + @test ParentV1(r0) == ParentV1(; r0.x, r0.y) + @test ParentV1(r0) == ParentV1(r0_arrow) + + @test NamedTuple(ChildV1(r0)) == (; r0.x, r0.y, r0.z) + @test ChildV1(r0) == ChildV1(; r0.x, r0.y, r0.z) + @test ChildV1(r0) == ChildV1(r0_arrow) + + @test NamedTuple(GrandchildV1(r0)) == (x=[42], y="fo", z=:three, a=1) + @test GrandchildV1(r0) == GrandchildV1(; r0.x, r0.y, r0.z, r0.a) + @test GrandchildV1(r0) == GrandchildV1(r0_arrow) - foo = TestRow(; x = [1]) - foo2 = TestRow(; x = [1]) - @test isequal(foo, foo2) - @test hash(foo) == hash(foo2) + tbl = Arrow.Table(Arrow.tobuffer((; x=[ParentV1(r0)]))) + @test tbl.x[1] == ParentV1(Tables.rowmerge(r0)) - foo3 = TestRow(; x = [3]) - @test !isequal(foo, foo3) - @test hash(foo) != hash(foo3) + # Note that Arrow.jl roundtrips z=:three to z="three", since + # `z::Symbol` isn't evident from these record types + r0_roundtripped = Tables.rowmerge(r0; z="three") + + tbl = Arrow.Table(Arrow.tobuffer((; x=[ChildV1(r0)]))) + @test tbl.x[1] == 
ChildV1(r0_roundtripped) + + tbl = Arrow.Table(Arrow.tobuffer((; x=[GrandchildV1(r0)]))) + @test tbl.x[1] == GrandchildV1(r0_roundtripped) + + svs = [GrandchildV1SchemaVersion(), ChildV1SchemaVersion(), ParentV1SchemaVersion()] + tbl = Arrow.Table(Arrow.tobuffer((; sv=svs))) + @test all(tbl.sv .== svs) + + tbl = [NestedV1(; gc=GrandchildV1(r0), k="test")] + roundtripped = Legolas.read(Legolas.tobuffer(tbl, NestedV1SchemaVersion())) + @test roundtripped.gc[1] == GrandchildV1(r0_roundtripped) + @test roundtripped.k[1] == "test" + + tbl = [NestedAgainV1(; n=NestedV1(; gc=GrandchildV1(r0), k="test"), h=3)] + roundtripped = Legolas.read(Legolas.tobuffer(tbl, NestedAgainV1SchemaVersion())) + @test roundtripped.n[1] == NestedV1(; gc=GrandchildV1(r0_roundtripped), k="test") + @test roundtripped.h[1] == 3 end -const MyInnerRow = @row("my-inner-schema@1", b::Int=1) -const MyOuterRow = @row("my-outer-schema@1", - a::String, - x::MyInnerRow=MyInnerRow(x)) +@testset "miscellaneous Legolas/src/tables.jl tests" begin + struct MyPath + x::String + end + Base.read(p::MyPath) = Base.read(p.x) + Base.write(p::MyPath, bytes) = Base.write(p.x, bytes) + root = mktempdir() + path = MyPath(joinpath(root, "baz.arrow")) + t = [(x=[1,2], y="hello"), (x=[3,4], y="bye")] + Legolas.write(path, t, ParentV1SchemaVersion()) + @test t == [NamedTuple(ParentV1(r)) for r in Tables.rows(Legolas.read(path))] + tbl = Arrow.Table(Legolas.tobuffer(t, ParentV1SchemaVersion(); metadata=("a" => "b", "c" => "d"))) + @test Set(Arrow.getmetadata(tbl)) == Set((Legolas.LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY => "test.parent@1", + "a" => "b", "c" => "d")) + + struct Moo + meta + end + Legolas.Arrow.getmetadata(moo::Moo) = moo.meta + moo = Moo(Dict("a" => "b", "b" => "b", Legolas.LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY => "test.parent@1")) + @test ParentV1SchemaVersion() == Legolas.extract_schema_version(moo) -@testset "Nested arrow serialization" begin - table = [MyOuterRow(; a="outer_a", x = MyInnerRow())] - 
roundtripped_table = Legolas.read(Legolas.tobuffer(table, Legolas.Schema("my-outer-schema@1"))) - @test table == MyOuterRow.(Tables.rows(roundtripped_table)) + t = Arrow.tobuffer((a=[1, 2], b=[3, 4]); metadata=Dict(Legolas.LEGOLAS_SCHEMA_QUALIFIED_METADATA_KEY => "haha@3")) + @test_throws Legolas.UnknownSchemaVersionError Legolas.read(t) end