-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
➕ Add embedding transformer and more test encoder tests
- Loading branch information
1 parent
3e67625
commit d2fc2a9
Showing
2 changed files
with
222 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
""" | ||
File containing ordinal encoder and entity embedding encoder. Borrows code from the MLJTransforms package. | ||
""" | ||
|
||
### Ordinal Encoder | ||
""" | ||
**Private Method** | ||
Fits an ordinal encoder to the table `X`, using only the columns with indices in `featinds`. | ||
Returns a dictionary mapping each column index to a dictionary mapping each level in that column to an integer. | ||
""" | ||
function ordinal_encoder_fit(X; featinds)
    # Outer dictionary: column index => (level => integer code) for that column
    column_mappings = Dict()

    for col_index in featinds
        col_levels = levels(Tables.getcolumn(X, col_index))
        n_levels = length(col_levels)
        # A column whose levels are exactly the integers 1:n_levels is treated as
        # already ordinal encoded, so no mapping is recorded for it
        if Set(col_levels) != Set(1:n_levels)
            # Integer codes follow the order in which `levels` reports the levels
            level_codes = Dict{Any, Integer}()
            for (code, level) in enumerate(col_levels)
                level_codes[level] = code
            end
            column_mappings[col_index] = level_codes
        end
    end
    return column_mappings
end
|
||
""" | ||
**Private Method** | ||
Checks that all levels in `test_levels` are also in `train_levels`. If not, throws an error. | ||
""" | ||
function check_unkown_levels(train_levels, test_levels)
    # Nothing to report when every test level was also seen during training
    issubset(test_levels, train_levels) && return nothing

    # Otherwise report exactly the test levels that training never saw
    unseen_levels = setdiff(test_levels, train_levels)
    error(
        "While transforming, found novel levels for the column: $(unseen_levels) that were not seen while training.",
    )
end
|
||
""" | ||
**Private Method** | ||
Transforms the table `X` using the ordinal encoder defined by `mapping_matrix`. | ||
Returns a new table with the same column names as `X`, but with categorical columns replaced by integer columns. | ||
""" | ||
function ordinal_encoder_transform(X, mapping_matrix)
    # With no mapping at all, or an empty one, the table is returned untouched
    (isnothing(mapping_matrix) || isempty(mapping_matrix)) && return X

    column_names = Tables.schema(X).names
    encoded_columns = map(1:length(column_names)) do position
        column = Tables.getcolumn(X, position)
        # Columns without a fitted mapping pass through unchanged
        haskey(mapping_matrix, position) || return column

        level_to_int = mapping_matrix[position]
        # Throws if the column has a level that the fitted mapping does not know
        check_unkown_levels(keys(level_to_int), levels(column))
        # The mapping is splatted into `level => integer` pairs for `recode`
        return recode(column, level_to_int...)
    end

    as_named_tuple = NamedTuple{Tuple(column_names)}(encoded_columns)
    # Attempt to preserve table type
    return Tables.materializer(X)(as_named_tuple)
end
|
||
""" | ||
**Private Method** | ||
Combine ordinal_encoder_fit and ordinal_encoder_transform and return both X and ordinal_mappings | ||
""" | ||
function ordinal_encoder_fit_transform(X; featinds)
    # Fit the level => integer mappings on the columns selected by `featinds`
    fitted_mappings = ordinal_encoder_fit(X; featinds = featinds)
    # Apply those mappings to the same table they were fitted on
    encoded_X = ordinal_encoder_transform(X, fitted_mappings)
    return (encoded_X, fitted_mappings)
end
|
||
|
||
|
||
## Entity Embedding Encoder (assuming precomputed weights) | ||
""" | ||
**Private method.** | ||
Generate `num_inds` new feature names `feat_name_1, feat_name_2, ..., feat_name_n`, adding extra underscores before the index until none of them clashes with `existing_names`. | ||
""" | ||
function generate_new_feat_names(feat_name, num_inds, existing_names)
    # Start with a single underscore between the feature name and the index and
    # add one more underscore after every round that produced a clashing name
    n_underscores = 1
    while true
        separator = "_"^n_underscores
        candidates = [Symbol("$(feat_name)$(separator)$i") for i in 1:num_inds]
        # Accept the first candidate set sharing no name with `existing_names`
        any(in(existing_names), candidates) || return candidates
        n_underscores += 1
    end
end
|
||
|
||
""" | ||
Given X and a dict of mapping_matrices that map each categorical column to a matrix, use the matrix to transform | ||
each level in each categorical column using the columns of the matrix. | ||
This is used with the embedding matrices of the entity embedding layer in entity-enabled models to implement entity embeddings. | ||
""" | ||
function embedding_transform(X, mapping_matrices)
    # Without embedding matrices there is nothing to do: return the table unchanged
    isnothing(mapping_matrices) && return X

    feat_names = Tables.schema(X).names
    original_names = collect(Symbol, feat_names)
    new_feat_names = Symbol[]
    new_cols = []
    for feat_name in feat_names
        col = Tables.getcolumn(X, feat_name)
        if feat_name in keys(mapping_matrices)
            level2vector = mapping_matrices[feat_name]
            # Every entry of the column selects one column of the embedding matrix.
            # Indexing once with all entries gives an (embedding dim) x (rows) matrix
            # without splatting one argument per row into `hcat`, and still yields
            # one (empty) output column per embedding dimension for an empty column.
            level_inds = [unwrap(x) for x in col]
            embedded = level2vector[:, level_inds]
            # Each row of `embedded` becomes one new feature column
            append!(new_cols, eachrow(embedded))
            # New names must avoid the original column names AND every name already
            # emitted for earlier columns; otherwise two embedded features could
            # receive the same name and the NamedTuple construction below would fail.
            taken_names = vcat(original_names, new_feat_names)
            feat_names_with_inds = generate_new_feat_names(
                feat_name,
                size(level2vector, 1),
                taken_names,
            )
            append!(new_feat_names, feat_names_with_inds)
        else
            # Not to be transformed => left as is
            push!(new_feat_names, feat_name)
            push!(new_cols, col)
        end
    end

    transformed_X = NamedTuple{Tuple(new_feat_names)}(new_cols)
    # Attempt to preserve table type
    return Tables.materializer(X)(transformed_X)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
|
||
|
||
@testset "ordinal encoder" begin
    # Columns 2 and 3 are categorical; columns 1 and 4 are plain numeric vectors
    table = (
        Column1 = [1, 2, 3, 4, 5],
        Column2 = categorical(['a', 'b', 'c', 'd', 'e']),
        Column3 = categorical(["b", "c", "d"]),
        Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
    )
    mappings = MLJFlux.ordinal_encoder_fit(table; featinds = [2, 3])
    encoded = MLJFlux.ordinal_encoder_transform(table, mappings)

    # Expected level => integer mapping for each fitted column
    @test mappings[2] == Dict('a' => 1, 'b' => 2, 'c' => 3, 'd' => 4, 'e' => 5)
    @test mappings[3] == Dict("b" => 1, "c" => 2, "d" => 3)

    # Expected contents of every column after the transform
    @test encoded.Column1 == [1, 2, 3, 4, 5]
    @test encoded.Column2 == [1, 2, 3, 4, 5]
    @test encoded.Column3 == [1, 2, 3]
    @test encoded.Column4 == [1.0, 2.0, 3.0, 4.0, 5.0]

    # After coercion, Column1 has levels 1:5 and must get no mapping entry
    coerced_table = coerce(table, :Column1 => Multiclass)
    coerced_mappings = MLJFlux.ordinal_encoder_fit(coerced_table; featinds = [1, 2, 3])
    @test !haskey(coerced_mappings, 1) # already encoded

    # The combined fit/transform call must reproduce the two-step result
    refit_encoded =
        first(MLJFlux.ordinal_encoder_fit_transform(coerced_table; featinds = [2, 3]))
    @test encoded == refit_encoded
end
|
||
@testset "Generate New feature names Function Tests" begin
    # Case 1: nothing to clash with, so a single underscore is used
    @testset "No Initial Conflicts" begin
        taken = []
        generated = MLJFlux.generate_new_feat_names("feat", 3, taken)
        @test generated == [:feat_1, :feat_2, :feat_3]
    end

    # Case 2: the single-underscore names are taken, so a second underscore is added
    @testset "Initial Conflict Resolution" begin
        taken = [:feat_1, :feat_2, :feat_3]
        generated = MLJFlux.generate_new_feat_names("feat", 3, taken)
        @test generated == [:feat__1, :feat__2, :feat__3]
    end
end
|
||
|
||
@testset "embedding_transform works" begin
    raw_table = (
        Column1 = [1, 2, 3, 4, 5],
        Column2 = categorical(['a', 'b', 'c', 'd', 'e']),
        Column3 = categorical(["b", "c", "d", "f", "f"]),
        Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
    )
    # One embedding matrix per categorical column: one matrix column per level,
    # one matrix row per embedding dimension
    column2_embedding = [
        1 0.5 0.7 4 5
        0.4 2 3 0.9 0.2
        0.1 0.6 0.8 0.3 0.4
    ]
    column3_embedding = [
        1 0.5 0.7 4
        0.4 2 3 0.9
    ]
    mapping_matrices =
        Dict(:Column2 => column2_embedding, :Column3 => column3_embedding)

    # Levels are first turned into integer codes, which then select matrix columns
    ordinal_table =
        first(MLJFlux.ordinal_encoder_fit_transform(raw_table; featinds = [2, 3]))
    embedded_table = MLJFlux.embedding_transform(ordinal_table, mapping_matrices)

    expected_table = (
        Column1 = [1, 2, 3, 4, 5],
        Column2_1 = [1.0, 0.5, 0.7, 4.0, 5.0],
        Column2_2 = [0.4, 2.0, 3.0, 0.9, 0.2],
        Column2_3 = [0.1, 0.6, 0.8, 0.3, 0.4],
        Column3_1 = [1.0, 0.5, 0.7, 4.0, 4.0],
        Column3_2 = [0.4, 2.0, 3.0, 0.9, 0.9],
        Column4 = [1.0, 2.0, 3.0, 4.0, 5.0],
    )
    @test embedded_table == expected_table
end