➕ Add embedding transformer and more test encoder tests

FluxML · Aug 5, 2024 · d2fc2a9 · d2fc2a9
1 parent 3e67625
commit d2fc2a9
Show file tree

Hide file tree

Showing 2 changed files with 222 additions and 0 deletions.
diff --git a/src/encoders.jl b/src/encoders.jl
@@ -0,0 +1,150 @@
+"""
+File containing ordinal encoder and entity embedding encoder. Borrows code from the MLJTransforms package.
+"""
+
+### Ordinal Encoder
+"""
+**Private Method**
+
+Fits an ordinal encoder to the table `X`, using only the columns with indices in `featinds`.
+
+Returns a dictionary mapping each column index to a dictionary mapping each level in that column to an integer.
+"""
+function ordinal_encoder_fit(X; featinds)
+    # 1. Define mapping per column per level dictionary
+    mapping_matrix = Dict()
+
+    # 2. Use feature mapper to compute the mapping of each level in each column
+    for i in featinds
+        feat_col = Tables.getcolumn(X, i)
+        feat_levels = levels(feat_col)
+        # Check if feat levels is already ordinal encoded in which case we skip
+        (Set(1:length(feat_levels)) == Set(feat_levels)) && continue
+        # Compute the dict using the given feature_mapper function
+        mapping_matrix[i] =
+            Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels))
+    end
+    return mapping_matrix
+end
+
+"""
+**Private Method**
+
+Checks that all levels in `test_levels` are also in `train_levels`. If not, throws an error.
+"""
+function check_unkown_levels(train_levels, test_levels)
+    # test levels must be a subset of train levels
+    if !issubset(test_levels, train_levels)
+        # get the levels in test that are not in train
+        lost_levels = setdiff(test_levels, train_levels)
+        error(
+            "While transforming, found novel levels for the column: $(lost_levels) that were not seen while training.",
+        )
+    end
+end
+
+"""
+**Private Method**
+
+Transforms the table `X` using the ordinal encoder defined by `mapping_matrix`.
+
+Returns a new table with the same column names as `X`, but with categorical columns replaced by integer columns.
+"""
+function ordinal_encoder_transform(X, mapping_matrix)
+    isnothing(mapping_matrix) && return X
+    isempty(mapping_matrix) && return X
+    feat_names = Tables.schema(X).names
+    numfeats = length(feat_names)
+    new_feats = []
+    for ind in 1:numfeats
+        col = Tables.getcolumn(X, ind)
+
+        # Create the transformation function for each column
+        if ind in keys(mapping_matrix)
+            train_levels = keys(mapping_matrix[ind])
+            test_levels = levels(col)
+            check_unkown_levels(train_levels, test_levels)
+            level2scalar = mapping_matrix[ind]
+            new_col = recode(col, level2scalar...)
+            push!(new_feats, new_col)
+        else
+            push!(new_feats, col)
+        end
+    end
+
+    transformed_X = NamedTuple{tuple(feat_names...)}(tuple(new_feats)...)
+    # Attempt to preserve table type
+    transformed_X = Tables.materializer(X)(transformed_X)
+    return transformed_X
+end
+
+"""
+**Private Method**
+
+Combine ordinal_encoder_fit and ordinal_encoder_transform and return both X and ordinal_mappings
+"""
+function ordinal_encoder_fit_transform(X; featinds)
+    ordinal_mappings = ordinal_encoder_fit(X; featinds = featinds)
+    return ordinal_encoder_transform(X, ordinal_mappings), ordinal_mappings
+end
+
+
+
+## Entity Embedding Encoder (assuming precomputed weights)
+"""
+**Private method.**
+
+Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n
+"""
+function generate_new_feat_names(feat_name, num_inds, existing_names)
+    conflict = true        # will be kept true as long as there is a conflict
+    count = 1            # number of conflicts+1 = number of underscores
+
+    new_column_names = []
+    while conflict
+        suffix = repeat("_", count)
+        new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
+        conflict = any(name -> name in existing_names, new_column_names)
+        count += 1
+    end
+    return new_column_names
+end
+
+
+"""
+Given X and a dict of mapping_matrices that map each categorical column to a matrix, use the matrix to transform
+each level in each categorical columns using the columns of the matrix.
+
+This is used with the embedding matrices of the entity embedding layer in entity enabled models to implement entity embeddings.
+"""
+function embedding_transform(X, mapping_matrices)
+    isnothing(mapping_matrices) && return X
+    feat_names = Tables.schema(X).names
+    new_feat_names = Symbol[]
+    new_cols = []
+    for feat_name in feat_names
+        col = Tables.getcolumn(X, feat_name)
+        # Create the transformation function for each column
+        if feat_name in keys(mapping_matrices)
+            level2vector = mapping_matrices[feat_name]
+            new_multi_col = map(x -> level2vector[:, unwrap(x)], col)
+            new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
+            push!(new_cols, new_multi_col...)
+            feat_names_with_inds = generate_new_feat_names(
+                feat_name,
+                size(level2vector, 1),
+                feat_names,
+            )
+            push!(new_feat_names, feat_names_with_inds...)
+        else
+            # Not to be transformed => left as is
+            push!(new_feat_names, feat_name)
+            push!(new_cols, col)
+        end
+    end
+
+    transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
+    # Attempt to preserve table type
+    transformed_X = Tables.materializer(X)(transformed_X)
+    return transformed_X
+end
diff --git a/test/encoders.jl b/test/encoders.jl
@@ -0,0 +1,72 @@
+
+
+@testset "ordinal encoder" begin
+    X = (
+        Column1 = [1, 2, 3, 4, 5],
+        Column2 = categorical(['a', 'b', 'c', 'd', 'e']),
+        Column3 = categorical(["b", "c", "d"]),
+        Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
+    )
+    map = MLJFlux.ordinal_encoder_fit(X; featinds = [2, 3])
+    Xenc = MLJFlux.ordinal_encoder_transform(X, map)
+    @test map[2] == Dict('a' => 1, 'b' => 2, 'c' => 3, 'd' => 4, 'e' => 5)
+    @test map[3] == Dict("b" => 1, "c" => 2, "d" => 3)
+    @test Xenc.Column1 == [1, 2, 3, 4, 5]
+    @test Xenc.Column2 == [1, 2, 3, 4, 5]
+    @test Xenc.Column3 == [1, 2, 3]
+    @test Xenc.Column4 == [1.0, 2.0, 3.0, 4.0, 5.0]
+
+    X = coerce(X, :Column1 => Multiclass)
+    map = MLJFlux.ordinal_encoder_fit(X; featinds = [1, 2, 3])
+    @test !haskey(map, 1)   # already encoded
+
+    @test Xenc == MLJFlux.ordinal_encoder_fit_transform(X; featinds = [2, 3])[1]
+end
+
+@testset "Generate New feature names Function Tests" begin
+    # Test 1: No initial conflicts
+    @testset "No Initial Conflicts" begin
+        existing_names = []
+        names = MLJFlux.generate_new_feat_names("feat", 3, existing_names)
+        @test names == [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
+    end
+
+    # Test 2: Handle initial conflict by adding underscores
+    @testset "Initial Conflict Resolution" begin
+        existing_names = [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
+        names = MLJFlux.generate_new_feat_names("feat", 3, existing_names)
+        @test names == [Symbol("feat__1"), Symbol("feat__2"), Symbol("feat__3")]
+    end
+end
+
+
+@testset "embedding_transform works" begin
+    X = (
+        Column1 = [1, 2, 3, 4, 5],
+        Column2 = categorical(['a', 'b', 'c', 'd', 'e']),
+        Column3 = categorical(["b", "c", "d", "f", "f"]),
+        Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
+    )
+    mapping_matrices = Dict(
+        :Column2 => [
+            1 0.5 0.7 4 5
+            0.4 2 3 0.9 0.2
+            0.1 0.6 0.8 0.3 0.4
+        ],
+        :Column3 => [
+            1 0.5 0.7 4
+            0.4 2 3 0.9
+        ],
+    )
+    X, _ = MLJFlux.ordinal_encoder_fit_transform(X; featinds = [2, 3])
+    Xenc = MLJFlux.embedding_transform(X, mapping_matrices)
+    @test Xenc == (
+        Column1 = [1, 2, 3, 4, 5],
+        Column2_1 = [1.0, 0.5, 0.7, 4.0, 5.0],
+        Column2_2 = [0.4, 2.0, 3.0, 0.9, 0.2],
+        Column2_3 = [0.1, 0.6, 0.8, 0.3, 0.4],
+        Column3_1 = [1.0, 0.5, 0.7, 4.0, 4.0],
+        Column3_2 = [0.4, 2.0, 3.0, 0.9, 0.9],
+        Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
+    )
+end