➕ Add ordinal encoder

FluxML · Aug 4, 2024 · 3e67625 · 3e67625
1 parent fb98c27
commit 3e67625
Show file tree

Hide file tree

Showing 5 changed files with 119 additions and 1 deletion.
diff --git a/Project.toml b/Project.toml
@@ -13,6 +13,7 @@ Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 

diff --git a/src/MLJFlux.jl b/src/MLJFlux.jl
@@ -5,6 +5,7 @@ export CUDALibs, CPU1
 import Flux
 using MLJModelInterface
 using MLJModelInterface.ScientificTypesBase
+using ScientificTypes: schema, Finite
 import Base.==
 using ProgressMeter
 using CategoricalArrays
@@ -28,6 +29,7 @@ include("classifier.jl")
 include("image.jl")
 include("mlj_model_interface.jl")
 include("entity_embedding.jl")
+include("ordinal_encoder.jl")
 
 export NeuralNetworkRegressor, MultitargetNeuralNetworkRegressor
 export NeuralNetworkClassifier, NeuralNetworkBinaryClassifier, ImageClassifier
@@ -37,4 +39,4 @@ export CategoricalEmbedder
 include("deprecated.jl")
 
 
-end #module
+end
diff --git a/src/ordinal_encoder.jl b/src/ordinal_encoder.jl
@@ -0,0 +1,79 @@
+"""
+    get_cat_inds(X)
+
+**Private Method**
+
+Return the positions of the columns of the table `X` whose scientific type, as
+reported by `schema(X).scitypes`, is a subtype of `Finite`.
+"""
+function get_cat_inds(X)
+    finite_cols = findall(schema(X).scitypes) do scitype
+        scitype <: Finite
+    end
+    return finite_cols
+end
+
+
+"""
+    ordinal_encoder_fit(X; featinds)
+
+**Private Method**
+
+Fit an ordinal encoder to the table `X`, using only the columns whose indices are
+listed in `featinds`.
+
+# Keywords
+- `featinds`: iterable of column indices to encode; each one is passed to
+  `Tables.getcolumn`.
+
+# Returns
+A dictionary keyed by column index. Each value is itself a dictionary mapping every
+level of that column (as given by `levels`) to its 1-based position among the levels.
+"""
+function ordinal_encoder_fit(X; featinds)
+    # `Tables.getcolumn` is only part of the interface of the object returned by
+    # `Tables.columns`, so normalise the table once before indexing into it.
+    cols = Tables.columns(X)
+    mapping_per_feat_level = Dict{Any, Dict{Any, Int}}()
+    for i in featinds
+        feat_levels = levels(Tables.getcolumn(cols, i))
+        # Store concrete `Int` codes rather than the abstract `Integer`, so the
+        # mapping itself has a concrete value type.
+        mapping_per_feat_level[i] = Dict{Any, Int}(
+            level => code for (code, level) in enumerate(feat_levels)
+        )
+    end
+    return mapping_per_feat_level
+end
+
+"""
+    check_unkown_levels(train_levels, test_levels)
+
+**Private Method**
+
+Verify that every element of `test_levels` also occurs in `train_levels`. If some do
+not, raise an error that lists the offending levels.
+"""
+function check_unkown_levels(train_levels, test_levels)
+    # Nothing to report when the test levels are a subset of the train levels
+    issubset(test_levels, train_levels) && return nothing
+    # Levels present at transform time that were absent during training
+    novel_levels = setdiff(test_levels, train_levels)
+    error("While transforming, found novel levels for the column: $(novel_levels) that were not seen while training.")
+end
+
+"""
+    ordinal_encoder_transform(X, mapping_per_feat_level)
+
+**Private Method**
+
+Transform the table `X` using the ordinal encoder described by
+`mapping_per_feat_level`.
+
+# Arguments
+- `X`: the table to transform.
+- `mapping_per_feat_level`: dictionary keyed by integer column index whose values map
+  each level of that column to its replacement (the structure returned by
+  `ordinal_encoder_fit`).
+
+# Returns
+A table with the same column names as `X`. Every column that has an entry in
+`mapping_per_feat_level` is recoded through that entry; all other columns are passed
+through unchanged. The result is handed to `Tables.materializer(X)` in an attempt to
+preserve the table type of the input.
+
+# Throws
+An error if a column to be encoded has a level that is missing from its mapping.
+"""
+function ordinal_encoder_transform(X, mapping_per_feat_level)
+    # Work on the `Tables.columns` view: `Tables.getcolumn`/`Tables.columnnames` are
+    # defined on it for every table, whereas `Tables.schema(X)` may be `nothing`.
+    cols = Tables.columns(X)
+    feat_names = Tables.columnnames(cols)
+
+    new_feats = map(1:length(feat_names)) do ind
+        col = Tables.getcolumn(cols, ind)
+        # Columns without a fitted mapping are passed through untouched
+        haskey(mapping_per_feat_level, ind) || return col
+        level2scalar = mapping_per_feat_level[ind]
+        # Refuse to transform levels that were never seen while fitting
+        check_unkown_levels(keys(level2scalar), levels(col))
+        return recode(col, level2scalar...)
+    end
+
+    transformed_X = NamedTuple{Tuple(feat_names)}(Tuple(new_feats))
+    # Attempt to preserve table type
+    return Tables.materializer(X)(transformed_X)
+end
diff --git a/test/ordinal_encoder.jl b/test/ordinal_encoder.jl
@@ -0,0 +1,31 @@
+@testset "test get_cat_inds" begin
+    # Four raw columns; the first three are coerced to finite scitypes below,
+    # the last one is left as floating point.
+    raw = (
+        C1 = [1, 2, 3, 4, 5],
+        C2 = ['a', 'b', 'c', 'd', 'e'],
+        C3 = ["b", "c", "d", "e", "f"],
+        C4 = [1.0, 2.0, 3.0, 4.0, 5.0],
+    )
+    X = coerce(raw, :C1 => OrderedFactor, :C2 => Multiclass, :C3 => Multiclass)
+    cat_inds = MLJFlux.get_cat_inds(X)
+    @test cat_inds == [1, 2, 3]
+end
+
+
+@testset "ordinal encoder" begin
+    # All columns share the same length so that `X` is a valid column table
+    X = (
+        Column1 = [1, 2, 3, 4, 5],
+        Column2 = categorical(['a', 'b', 'c', 'd', 'e']),
+        Column3 = categorical(["b", "c", "d", "b", "c"]),
+        Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
+    )
+    # Named `mapping` rather than `map` to avoid shadowing `Base.map`
+    mapping = MLJFlux.ordinal_encoder_fit(X; featinds = [2, 3])
+    Xenc = MLJFlux.ordinal_encoder_transform(X, mapping)
+    @test mapping[2] == Dict('a' => 1, 'b' => 2, 'c' => 3, 'd' => 4, 'e' => 5)
+    @test mapping[3] == Dict("b" => 1, "c" => 2, "d" => 3)
+    # Columns that were not fitted are passed through unchanged
+    @test Xenc.Column1 == [1, 2, 3, 4, 5]
+    @test Xenc.Column2 == [1, 2, 3, 4, 5]
+    @test Xenc.Column3 == [1, 2, 3, 1, 2]
+    @test Xenc.Column4 == [1.0, 2.0, 3.0, 4.0, 5.0]
+
+    # A level that was not present while fitting must be rejected when transforming
+    Xnovel = merge(X, (Column2 = categorical(['a', 'b', 'c', 'd', 'z']),))
+    @test_throws ErrorException MLJFlux.ordinal_encoder_transform(Xnovel, mapping)
+end
+
+
+
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -3,6 +3,7 @@ using Tables
 using MLJBase
 import MLJFlux
 using CategoricalArrays
+using ScientificTypes: coerce, Multiclass, OrderedFactor
 using ColorTypes
 using Flux
 import Random
@@ -78,3 +79,7 @@ end
 @conditional_testset "entity_embedding.jl" begin
     include("entity_embedding.jl")
 end
+
+@conditional_testset "ordinal_encoder.jl" begin
+    include("ordinal_encoder.jl")
+end