Skip to content

Commit

Permalink
➕ Add ordinal encoder
Browse files Browse the repository at this point in the history
  • Loading branch information
EssamWisam committed Aug 4, 2024
1 parent fb98c27 commit 3e67625
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 1 deletion.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

Expand Down
4 changes: 3 additions & 1 deletion src/MLJFlux.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export CUDALibs, CPU1
import Flux
using MLJModelInterface
using MLJModelInterface.ScientificTypesBase
using ScientificTypes: schema, Finite
import Base.==
using ProgressMeter
using CategoricalArrays
Expand All @@ -28,6 +29,7 @@ include("classifier.jl")
include("image.jl")
include("mlj_model_interface.jl")
include("entity_embedding.jl")
include("ordinal_encoder.jl")

export NeuralNetworkRegressor, MultitargetNeuralNetworkRegressor
export NeuralNetworkClassifier, NeuralNetworkBinaryClassifier, ImageClassifier
Expand All @@ -37,4 +39,4 @@ export CategoricalEmbedder
include("deprecated.jl")


end #module
end
79 changes: 79 additions & 0 deletions src/ordinal_encoder.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
**Private Method**
Returns the indices of the categorical columns in the table `X`.
"""
function get_cat_inds(X)
    # Scientific type of every column of `X`, in column order
    col_scitypes = schema(X).scitypes
    # Keep the positions whose scitype is a subtype of `Finite`
    return findall(col_scitypes) do T
        T <: Finite
    end
end


"""
**Private Method**
Fits an ordinal encoder to the table `X`, using only the columns with indices in `featinds`.
Returns a dictionary mapping each column index to a dictionary mapping each level in that column to an integer.
"""
function ordinal_encoder_fit(X; featinds)
    # Number the levels of a single column 1, 2, ... in the order `levels` reports them
    function encode_levels(col)
        codes = Dict{Any, Integer}()
        for (code, level) in enumerate(levels(col))
            codes[level] = code
        end
        return codes
    end

    # Outer dictionary is untyped and keyed by column index; each value is the
    # level => integer dictionary of that column
    return Dict{Any, Any}(
        ind => encode_levels(Tables.getcolumn(X, ind)) for ind in featinds
    )
end

"""
**Private Method**
Checks that all levels in `test_levels` are also in `train_levels`. If not, throws an error.
"""
function check_unkown_levels(train_levels, test_levels)
    # Nothing to report when every test level was already present at training time
    issubset(test_levels, train_levels) && return nothing

    # Levels present in the test data but absent from the training data
    unseen = setdiff(test_levels, train_levels)
    error("While transforming, found novel levels for the column: $(unseen) that were not seen while training.")
end

"""
**Private Method**
Transforms the table `X` using the ordinal encoder defined by `mapping_per_feat_level`.
Returns a new table with the same column names as `X`, but with categorical columns replaced by integer columns.
"""
function ordinal_encoder_transform(X, mapping_per_feat_level)
    # `mapping_per_feat_level` maps a column index to that column's
    # level => integer dictionary; columns without an entry pass through unchanged.
    # An error is raised (by `check_unkown_levels`) when an encoded column holds a
    # level that has no entry in its dictionary.
    feat_names = Tables.schema(X).names
    new_feats = []    # columns have differing types, so the container stays untyped
    for ind in eachindex(feat_names)
        col = Tables.getcolumn(X, ind)

        if haskey(mapping_per_feat_level, ind)
            level2scalar = mapping_per_feat_level[ind]
            check_unkown_levels(keys(level2scalar), levels(col))
            # `recode` applied to a categorical array produces another categorical
            # array; unwrap its elements so the column holds plain integer codes
            push!(new_feats, unwrap.(recode(col, level2scalar...)))
        else
            push!(new_feats, col)
        end
    end

    transformed_X = NamedTuple{Tuple(feat_names)}(Tuple(new_feats))
    # Attempt to preserve table type
    return Tables.materializer(X)(transformed_X)
end
31 changes: 31 additions & 0 deletions test/ordinal_encoder.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
@testset "test get_cat_inds" begin
    raw = (
        C1 = [1, 2, 3, 4, 5],
        C2 = ['a', 'b', 'c', 'd', 'e'],
        C3 = ["b", "c", "d", "e", "f"],
        C4 = [1.0, 2.0, 3.0, 4.0, 5.0],
    )
    # The first three columns are coerced to finite scitypes; C4 is left uncoerced
    X = coerce(raw, :C1 => OrderedFactor, :C2 => Multiclass, :C3 => Multiclass)

    # Only the coerced columns should be reported as categorical
    @test MLJFlux.get_cat_inds(X) == [1, 2, 3]
end


@testset "ordinal encoder" begin
    X = (
        Column1 = [1, 2, 3, 4, 5],
        Column2 = categorical(['a', 'b', 'c', 'd', 'e']),
        Column3 = categorical(["b", "c", "d"]),
        Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
    )
    # Fit on the two categorical columns only, then encode the same table
    level_maps = MLJFlux.ordinal_encoder_fit(X; featinds = [2, 3])
    Xenc = MLJFlux.ordinal_encoder_transform(X, level_maps)

    # Fitted level => integer dictionaries
    @test level_maps[2] == Dict('a' => 1, 'b' => 2, 'c' => 3, 'd' => 4, 'e' => 5)
    @test level_maps[3] == Dict("b" => 1, "c" => 2, "d" => 3)

    # Columns 2 and 3 are encoded; columns 1 and 4 are passed through
    @test Xenc.Column1 == [1, 2, 3, 4, 5]
    @test Xenc.Column2 == [1, 2, 3, 4, 5]
    @test Xenc.Column3 == [1, 2, 3]
    @test Xenc.Column4 == [1.0, 2.0, 3.0, 4.0, 5.0]
end



5 changes: 5 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ using Tables
using MLJBase
import MLJFlux
using CategoricalArrays
using ScientificTypes: coerce, Multiclass, OrderedFactor
using ColorTypes
using Flux
import Random
Expand Down Expand Up @@ -78,3 +79,7 @@ end
@conditional_testset "entity_embedding.jl" begin
include("entity_embedding.jl")
end

@conditional_testset "ordinal_encoder.jl" begin
include("ordinal_encoder.jl")
end

0 comments on commit 3e67625

Please sign in to comment.