Commit 99ba2c2

new algorithm and datasets

jbytecode committed Aug 27, 2020
1 parent 38a30fa commit 99ba2c2
Showing 5 changed files with 195 additions and 7 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
authors = ["Mehmet Hakan Satman <[email protected]>"]
name = "LinRegOutliers"
uuid = "6d4de0fb-32d9-4c65-aac1-cc9ed8b94b1a"
authors = ["Mehmet Hakan Satman <[email protected]>"]
version = "0.2.0"
version = "0.3.0"

[deps]
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
2 changes: 2 additions & 0 deletions src/LinRegOutliers.jl
@@ -21,6 +21,7 @@ include("mveltsplot.jl")
include("bch.jl")
include("py95.jl")
include("satman2013.jl")
include("satman2015.jl")

# Essentials from other packages
export @formula, DataFrame
@@ -53,5 +54,6 @@ export mveltsplot
export bch, bchplot
export py95, py95SuspectedObservations
export satman2013
export satman2015

end # module
100 changes: 96 additions & 4 deletions src/data.jl
@@ -1,14 +1,106 @@
# Telephone data
phones = DataFrame(
"""
Phone data
# Components
- `year::Integer`: years from 1950 to 1973, coded as 50 to 73.
- `calls::Float64`: phone calls (in millions).
# Reference
P. J. Rousseeuw and A. M. Leroy (1987) _Robust Regression and
Outlier Detection._ Wiley.
"""
const phones = DataFrame(
year = collect(50:73),
calls = [4.4, 4.7, 4.7, 5.9, 6.6, 7.3, 8.1, 8.8, 10.6, 12.0, 13.5, 14.9, 16.1, 21.2, 119.0, 124.0,
142.0, 159.0, 182.0, 212.0, 43.0, 24.0, 27.0, 29.0]
)

# Hawkins, Bradu, Kass data
hbk = DataFrame(


"""
Hawkins & Bradu & Kass data
# Components
- `x1::Float64`: first independent variable.
- `x2::Float64`: second independent variable.
- `x3::Float64`: third independent variable.
- `y::Float64`: dependent (response) variable.
# Reference
Hawkins, D.M., Bradu, D., and Kass, G.V. (1984) Location of several outliers in multiple regression data using elemental sets. Technometrics 26, 197–208.
"""
const hbk = DataFrame(
x1 = [10.1, 9.5, 10.7, 9.9, 10.3, 10.8, 10.5, 9.9, 9.7, 9.3, 11, 12, 12, 11, 3.4, 3.1, 0, 2.3, 0.8, 3.1, 2.6, 0.4, 2, 1.3, 1, 0.9, 3.3, 1.8, 1.2, 1.2, 3.1, 0.5, 1.5, 0.4, 3.1, 1.1, 0.1, 1.5, 2.1, 0.5, 3.4, 0.3, 0.1, 1.8, 1.9, 1.8, 3, 3.1, 3.1, 2.1, 2.3, 3.3, 0.3, 1.1, 0.5, 1.8, 1.8, 2.4, 1.6, 0.3, 0.4, 0.9, 1.1, 2.8, 2, 0.2, 1.6, 0.1, 2, 1, 2.2, 0.6, 0.3, 0, 0.3],
x2 = [19.6, 20.5, 20.2, 21.5, 21.1, 20.4, 20.9, 19.6, 20.7, 19.7, 24, 23, 26, 34, 2.9, 2.2, 1.6, 1.6, 2.9, 3.4, 2.2, 3.2, 2.3, 2.3, 0, 3.3, 2.5, 0.8, 0.9, 0.7, 1.4, 2.4, 3.1, 0, 2.4, 2.2, 3, 1.2, 0, 2, 1.6, 1, 3.3, 0.5, 0.1, 0.5, 0.1, 1.6, 2.5, 2.8, 1.5, 0.6, 0.4, 3, 2.4, 3.2, 0.7, 3.4, 2.1, 1.5, 3.4, 0.1, 2.7, 3, 0.7, 1.8, 2, 0, 0.6, 2.2, 2.5, 2, 1.7, 2.2, 0.4],
x3 = [28.3, 28.9, 31, 31.7, 31.1, 29.2, 29.1, 28.8, 31, 30.3, 35, 37, 34, 34, 2.1, 0.3, 0.2, 2, 1.6, 2.2, 1.9, 1.9, 0.8, 0.5, 0.4, 2.5, 2.9, 2, 0.8, 3.4, 1, 0.3, 1.5, 0.7, 3, 2.7, 2.6, 0.2, 1.2, 1.2, 2.9, 2.7, 0.9, 3.2, 0.6, 3, 0.8, 3, 1.9, 2.9, 0.4, 1.2, 3.3, 0.3, 0.9, 0.9, 0.7, 1.5, 3, 3.3, 3, 0.3, 0.2, 2.9, 2.7, 0.8, 1.2, 1.1, 0.3, 2.9, 2.3, 1.5, 2.2, 1.6, 2.6],
y = [9.7, 10.1, 10.3, 9.5, 10, 10, 10.8, 10.3, 9.6, 9.9, -0.2, -0.4, 0.7, 0.1, -0.4, 0.6, -0.2, 0, 0.1, 0.4, 0.9, 0.3, -0.8, 0.7, -0.3, -0.8, -0.7, 0.3, 0.3, -0.3, 0, -0.4, -0.6, -0.7, 0.3, -1, -0.6, 0.9, -0.7, -0.5, -0.1, -0.7, 0.6, -0.7, -0.5, -0.4, -0.9, 0.1, 0.9, -0.4, 0.7, -0.5, 0.7, 0.7, 0, 0.1, 0.7, -0.1, -0.3, -0.9, -0.3, 0.6, -0.3, -0.5, 0.6, -0.9, -0.7, 0.6, 0.2, 0.7, 0.2, -0.2, 0.4, -0.9, 0.2]
)



"""
Animals data
# Components
- `names::AbstractString`: names of animals.
- `body::Float64`: body weight in kg.
- `brain::Float64`: brain weight in g.
# References
Venables, W. N. and Ripley, B. D. (1999) _Modern Applied
Statistics with S-PLUS._ Third Edition. Springer.
P. J. Rousseeuw and A. M. Leroy (1987) _Robust Regression and
Outlier Detection._ Wiley, p. 57.
"""
const animals = DataFrame(
names = ["Mountain beaver", "Cow", "Grey wolf", "Goat", "Guinea pig", "Dipliodocus", "Asian elephant", "Donkey", "Horse", "Potar monkey", "Cat", "Giraffe", "Gorilla", "Human", "African elephant", "Triceratops", "Rhesus monkey", "Kangaroo", "Golden hamster", "Mouse", "Rabbit", "Sheep", "Jaguar", "Chimpanzee", "Rat", "Brachiosaurus", "Mole", "Pig"],
body = [1.35, 465, 36.33, 27.66, 1.04, 11700, 2547, 187.1, 521, 10, 3.3, 529, 207, 62, 6654, 9400, 6.8, 35, 0.12, 0.023, 2.5, 55.5, 100, 52.16, 0.28, 87000, 0.122, 192],
brain = [8.1, 423, 119.5, 115, 5.5, 50, 4603, 419, 655, 115, 25.6, 680, 406, 1320, 5712, 70, 179, 56, 1, 0.4, 12.1, 175, 157, 440, 1.9, 154.5, 3, 180]
)



"""
Weight loss data
# Components
- `days::Integer`: time in days since the start of the diet program.
- `weight::Float64`: weight in kg.
# Reference
Venables, W. N. and Ripley, B. D. (1999) _Modern Applied
Statistics with S-PLUS._ Third Edition. Springer.
"""
const weightloss = DataFrame(
days = [0, 4, 7, 7, 11, 18, 24, 30, 32, 43, 46, 60, 64, 70, 71, 71, 73, 74, 84, 88, 95, 102, 106, 109, 115, 122, 133, 137, 140, 143, 147, 148, 149, 150, 153, 156, 161, 164, 165, 165, 170, 176, 179, 198, 214, 218, 221, 225, 233, 238, 241, 246],
weight = [184.35, 182.51, 180.45, 179.91, 177.91, 175.81, 173.11, 170.06, 169.31, 165.1, 163.11, 158.3, 155.8, 154.31, 153.86, 154.2, 152.2, 152.8, 150.3, 147.8, 146.1, 145.6, 142.5, 142.3, 139.4, 137.9, 133.7, 133.7, 133.3, 131.2, 133, 132.2, 130.8, 131.3, 129, 127.9, 126.9, 127.7, 129.5, 128.4, 125.4, 124.9, 124.9, 118.2, 118.2, 115.3, 115.7, 116, 115.5, 112.6, 114, 112.6]
)


"""
Stack loss data
# Components
- `airflow::Float64`: flow of cooling air (independent variable).
- `watertemp::Float64`: cooling water inlet temperature (independent variable).
- `acidcond::Float64`: concentration of acid (independent variable).
- `stackloss::Float64`: stack loss (dependent variable).
# References
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S Language_. Wadsworth & Brooks/Cole.
Dodge, Y. (1996) The guinea pig of multiple regression. In: _Robust Statistics, Data Analysis, and Computer Intensive Methods;
In Honor of Peter Huber's 60th Birthday_, 1996, _Lecture Notes in Statistics_ *109*, Springer-Verlag, New York.
"""
const stackloss = DataFrame(
airflow = [80.0, 80, 75, 62, 62, 62, 62, 62, 58, 58, 58, 58, 58, 58, 50, 50, 50, 50, 50, 56, 70],
watertemp = [27.0, 27, 25, 24, 22, 23, 24, 24, 23, 18, 18, 17, 18, 19, 18, 18, 19, 19, 20, 20, 20],
acidcond = [89.0, 88, 90, 87, 87, 87, 93, 93, 87, 80, 89, 88, 82, 93, 89, 86, 72, 79, 80, 82, 91],
stackloss = [42.0, 37, 37, 28, 18, 18, 19, 20, 15, 14, 14, 13, 11, 12, 8, 7, 8, 8, 9, 15, 15]
)
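
Each dataset is now a documented const DataFrame, so it can be fed directly into a regression setting. A minimal usage sketch, assuming the helpers exported by the package:

using LinRegOutliers
setting = createRegressionSetting(@formula(stackloss ~ airflow + watertemp + acidcond), stackloss)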
80 changes: 80 additions & 0 deletions src/satman2015.jl
@@ -0,0 +1,80 @@
function dominates(p1::Array, p2::Array)::Bool
    # p1 dominates p2 if it is no worse in every component
    # and strictly better in at least one.
    n = length(p1)
    worse = count(i -> p1[i] < p2[i], 1:n)
    better = count(i -> p1[i] > p2[i], 1:n)
    return (worse == 0) && (better > 0)
end
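
dominates implements the usual Pareto-dominance test: p1 dominates p2 when it is no worse in every component and strictly better in at least one. A minimal sketch of the expected behavior, mirroring the tests added in this commit:

dominates([1, 2, 4], [1, 2, 3])  # true: no worse anywhere, strictly better in the last component
dominates([1, 2, 3], [1, 2, 3])  # false: equal vectors, nothing strictly better
dominates([1, 2, 3], [1, 2, 4])  # false: worse in the last component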

function ndsranks(data::DataFrame)::Array{Int}
    # Delegate to the Matrix method.
    mat = Matrix(data)
    return ndsranks(mat)
end

function ndsranks(data::Matrix)::Array{Int}
    # Rank each observation by the number of other observations it dominates.
    n = size(data, 1)
    ranks = zeros(Int, n)
    for i in 1:n
        for j in 1:n
            if i != j && dominates(data[i, :], data[j, :])
                ranks[i] += 1
            end
        end
    end
    return ranks
end
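
ndsranks scores each row by how many other rows it dominates, so observations deeper in the point cloud receive higher ranks. An illustrative call against the Matrix method, assuming three ordered points:

ndsranks([1.0 1.0; 2.0 2.0; 3.0 3.0])  # [0, 1, 2]: each row dominates every row with strictly smaller entries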

function midlist(n::Int, p::Int)::Array{Int, 1}
    # Indices of the middle p observations (p + 1 when n - p is odd)
    # in a list of n ordered observations.
    start = div(n - p, 2) + 1
    stop = (n - p) % 2 == 0 ? start + p - 1 : start + p
    return collect(start:stop)
end
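
midlist selects the central positions of an ordered list: p indices when n - p is even, p + 1 when it is odd. For example, with the phones data (n = 24) and p = 2:

midlist(24, 2)  # [12, 13]
midlist(23, 2)  # [11, 12, 13]: one extra index when n - p is odd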

function satman2015(setting::RegressionSetting)
    X = designMatrix(setting)
    Y = responseVector(setting)
    n, p = size(X)
    h = Int(floor((n + p + 1.0) / 2.0))

    allindices = collect(1:n)

    # Order observations by their non-dominated sorting ranks and
    # take the middle ones as the initial basic subset.
    ranks = ndsranks(X)
    ranks_ordering = sortperm(ranks)

    basic_center_indices = midlist(n, p)
    basic_subset_indices = ranks_ordering[basic_center_indices]

    # Grow the basic subset to the h observations with the smallest
    # Mahalanobis distances, then refine it with C-steps.
    meanvector = applyColumns(mean, X[basic_subset_indices, :])
    covmat = cov(X[basic_subset_indices, :])
    md2 = diag(mahalabonisSquaredMatrix(X, meanvector = meanvector, covmatrix = covmat))
    md = sqrt.(md2)
    sorted_indices = sortperm(md)
    best_h_indices = sorted_indices[1:h]

    crit, bestset = iterateCSteps(setting, best_h_indices, h)

    # Fit OLS on the clean subset and flag observations whose
    # median-standardized residuals exceed 2.5 in absolute value.
    ols = lm(setting.formula, setting.data[bestset, :])
    betas = coef(ols)
    resids = Y .- (X * betas)
    med_res = median(resids)
    standardized_resids = (resids .- med_res) / median(abs.(resids .- med_res))

    outlierset = filter(i -> abs(standardized_resids[i]) > 2.5, allindices)

    result = Dict()
    result["outliers"] = outlierset

    return result
end
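
A typical call, mirroring the test added at the bottom of this commit for the phones data:

setting = createRegressionSetting(@formula(calls ~ year), phones)
result = satman2015(setting)
result["outliers"]  # indices of the observations flagged as outliers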
16 changes: 15 additions & 1 deletion test/runtests.jl
@@ -53,7 +53,7 @@ end
end


@testset "Kianifard & Shallow 1989 - Algorithm" begin
@testset "Kianifard & Swallow 1989 - Algorithm" begin
df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
outset = ks89(reg)
@@ -215,3 +215,17 @@ end
end
end

@testset "Satman2015 - Algorithm - Phone data" begin
@test LinRegOutliers.dominates([1,2,4], [1,2,3])
@test LinRegOutliers.dominates([1,2,3], [0,1,2])
@test !LinRegOutliers.dominates([1,2,3], [1,2,3])
@test !LinRegOutliers.dominates([1,2,3], [1,2,4])

df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
result = satman2015(reg)
outliers = result["outliers"]
for i in 15:20
@test i in outliers
end
end
