Skip to content

Commit

Permalink
Hadi (1992) implemented
Browse files Browse the repository at this point in the history
  • Loading branch information
jbytecode committed Sep 1, 2020
1 parent 35f3705 commit feb2991
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 16 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
authors = ["Mehmet Hakan Satman <[email protected]>"]
name = "LinRegOutliers"
uuid = "6d4de0fb-32d9-4c65-aac1-cc9ed8b94b1a"
authors = ["Mehmet Hakan Satman <[email protected]>"]
version = "0.3.2"
version = "0.3.3"

[deps]
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ A Julia package for outlier detection in linear regression.
- Satman (2013)
- Satman (2015)
- Setan & Halim & Mohd (2000)
- Least Absolute Deviations

- Least Absolute Deviations (LAD)
- Least Trimmed Absolute Deviations (LTA)
- Hadi (1992)

## Example

Expand Down
22 changes: 13 additions & 9 deletions src/LinRegOutliers.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
module LinRegOutliers

#using GLM
#using DataFrames
#using Distributions
#using Clustering
#using StatsBase
#using LinearAlgebra
#using Plots
#using Optim
# using GLM
# using DataFrames
# using Distributions
# using Clustering
# using StatsBase
# using LinearAlgebra
# using Plots
# using Optim

import GLM: @formula, lm, FormulaTerm, ModelFrame, ModelMatrix, predict, coef, residuals
import DataFrames: DataFrame
Expand Down Expand Up @@ -35,15 +35,19 @@ include("satman2013.jl")
include("satman2015.jl")
include("lad.jl")
include("lta.jl")
include("hadi1992.jl")

# Essentials from other packages
export @formula, DataFrame
export mean, quantile

# Basics
export RegressionSetting
export createRegressionSetting
export designMatrix
export responseVector
export applyColumns
export find_minimum_nonzero

# Data
export phones, hbk
Expand All @@ -70,5 +74,5 @@ export satman2013
export satman2015, dominates
export lad
export lta

export hadi1992
end # module
13 changes: 12 additions & 1 deletion src/basis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ end

function responseVector(setting::RegressionSetting)::Array{Float64,1}
mf = ModelFrame(setting.formula, setting.data)
return setting.data[mf.f.lhs.sym]
return setting.data[:,mf.f.lhs.sym]
end

function applyColumns(f::Function, data::DataFrame)
Expand All @@ -25,3 +25,14 @@ end
function applyColumns(f::Function, data::Matrix)
return [f(col) for col = eachcol(data)]
end

function find_minimum_nonzero(arr::Array{Float64,1})
arr_sorted = sort(arr)
minval = arr_sorted[length(arr)]
for val in arr_sorted
if val < minval && val > 0
minval = val
end
end
return minval
end
74 changes: 74 additions & 0 deletions src/hadi1992.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
function hadi1992_handle_singularity(S::Array{Float64,2})::Array{Float64,2}
p, _ = size(S)
eigen_structure = eigen(S)
values = eigen_structure.values
vectors = eigen_structure.vectors
lambda_s = find_minimum_nonzero(values)
W = zeros(Float64, p, p)
for i in 1:p
W[i, i] = 1 / max(values[i], lambda_s)
end
newS = vectors * W * vectors
return newS
end

function hadi1992(multivariateData::Array{Float64,2}; alpha=0.05)
n, p = size(multivariateData)
h = Int(floor((n + p + 1.0) / 2.0))
chi_50_quantile = quantile(Chisq(p), 0.50)
critical_quantile = quantile(Chisq(p), 1 - alpha)
allindices = collect(1:n)

# Step 0
meds = coordinatwisemedians(multivariateData)
Sm = (1.0 / (n - 1.0)) * (multivariateData .- meds')' * (multivariateData .- meds')
mah0 = diag(mahalabonisSquaredMatrix(multivariateData, meanvector=meds, covmatrix=Sm))
ordering_indices_mah0 = sortperm(mah0)
best_indices_mah0 = ordering_indices_mah0[1:h]
starting_data = multivariateData[best_indices_mah0, :]

Cv = coordinatwisemedians(starting_data)
Sv = (1.0 / (h - 1.0)) * (starting_data .- Cv')' * (starting_data .- Cv')
mah1 = diag(mahalabonisSquaredMatrix(multivariateData, meanvector=Cv, covmatrix=Sv))
ordering_indices_mah1 = sortperm(mah1)

r = p + 1
basic_subset_indices = []
basic_subset = []
sorted_mah1 = []

while r < n
cnpr = 1 + (r / (n - p))^2.0
basic_subset_indices = ordering_indices_mah1[1:r]
basic_subset = multivariateData[basic_subset_indices, :]
Cb = applyColumns(mean, basic_subset)
Sb = cov(basic_subset)

r += 1
cfactor = cnpr * sqrt(sort(mah1)[h]) / chi_50_quantile
if det(cfactor * Sb) == 0
@info "singular Sb case"
newSb = hadi1992_handle_singularity(cfactor * Sb)
mah1 = diag(mahalabonisSquaredMatrix(multivariateData, meanvector=Cb, covmatrix=newSb))
ordering_indices_mah1 = sortperm(mah1)
basic_subset_indices = ordering_indices_mah1[1:r]
else
mah1 = diag(mahalabonisSquaredMatrix(multivariateData, meanvector=Cb, covmatrix=(cfactor * Sb)))
ordering_indices_mah1 = sortperm(mah1)
basic_subset_indices = ordering_indices_mah1[1:r]
end

sorted_mah1 = sort(mah1)
if sorted_mah1[r] >= critical_quantile
break
end
end

outlierset = setdiff(allindices, basic_subset_indices)

result = Dict()
result["outliers"] = sort(outlierset)
result["criticial.chi.squared"] = critical_quantile
result["rth.robust.distance"] = sorted_mah1[r - 1]
return result
end
49 changes: 47 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,33 @@ using GLM

using LinRegOutliers

@testset "Apply function to columns" begin
ones_vector = ones(Float64, 10)
zeros_vector = zeros(Float64, 10)
mat = hcat(ones_vector, zeros_vector)

@test applyColumns(sum, mat) == [10.0, 0.0]
@test applyColumns(mean, mat) == [1.0, 0.0]
end

@testset "Basis - createRegressionSetting, designMatrix, responseVector" begin
dataset = DataFrame(
x=[1.0, 2, 3, 4, 5],
y=[2.0, 4, 6, 8, 10]
)
setting = createRegressionSetting(@formula(y ~ x), dataset)

@test designMatrix(setting) == hcat(ones(5), dataset[:, "x"])
@test responseVector(setting) == dataset[:, "y"]
@test setting.formula == @formula(y ~ x)
end

@testset "Find nonzero minimum" begin
arr = [0.0, 2.0, 0.0, 0.0, 0.0, 9.0, 1.0]
val = find_minimum_nonzero(arr)
@test val == 1.0
end

@testset "Hadi & Simonoff 1993 - initial subset" begin
# Create simple data
rng = MersenneTwister(12345)
Expand Down Expand Up @@ -150,6 +177,7 @@ end
@test 21 in outset
end


@testset "MVE - Algorithm - Phone data" begin
df = phones
outset = mve(df)["outliers"]
Expand Down Expand Up @@ -194,6 +222,7 @@ end
end
end


@testset "PY95 - Algorithm - Hawkins & Bradu & Kass data" begin
df = hbk
reg = createRegressionSetting(@formula(y ~ x1 + x2 + x3), df)
Expand All @@ -204,7 +233,6 @@ end
end
end


@testset "satman2013 - Algorithm - Hawkins & Bradu & Kass data" begin
df = hbk
reg = createRegressionSetting(@formula(y ~ x1 + x2 + x3), df)
Expand Down Expand Up @@ -265,9 +293,26 @@ end
eps = 0.0001
df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
result = lta(reg, exact = true)
result = lta(reg, exact=true)
betas = result["betas"]
@test abs(betas[1] - -55.5) < eps
@test abs(betas[2] - 1.15) < eps
end

@testset "Hadi 1992 - Algorithm - with several case" begin
phones_matrix = hcat(phones[:, "calls"], phones[:, "year"])
result = hadi1992(phones_matrix)
outlier_indices = result["outliers"]
for i in 15:20
@test i in outlier_indices
end

hbk_matrix = hcat(hbk[:, "x1"], hbk[:, "x2"], hbk[:, "x3"])
hbk_n, hbk_p = size(hbk_matrix)
result = hadi1992(hbk_matrix)
outlier_indices = result["outliers"]
for i in 15:hbk_n
@test !(i in outlier_indices)
end

end

0 comments on commit feb2991

Please sign in to comment.