Commit
Showing 5 changed files with 195 additions and 7 deletions.
@@ -1,7 +1,7 @@
-authors = ["Mehmet Hakan Satman <[email protected]>"]
 name = "LinRegOutliers"
 uuid = "6d4de0fb-32d9-4c65-aac1-cc9ed8b94b1a"
+authors = ["Mehmet Hakan Satman <[email protected]>"]
-version = "0.2.0"
+version = "0.3.0"

 [deps]
 Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
@@ -1,14 +1,106 @@
-# Telephone data
-phones = DataFrame(
+"""
+Phone data
+# Components
+- `year::Integer`: years from 1950 to 1973.
+- `calls::Float64`: phone calls (in millions).
+# Reference
+P. J. Rousseeuw and A. M. Leroy (1987) _Robust Regression &
+Outlier Detection._ Wiley.
+"""
+const phones = DataFrame(
     year = collect(50:73),
     calls = [4.4, 4.7, 4.7, 5.9, 6.6, 7.3, 8.1, 8.8, 10.6, 12.0, 13.5, 14.9, 16.1, 21.2, 119.0, 124.0,
              142.0, 159.0, 182.0, 212.0, 43.0, 24.0, 27.0, 29.0]
 )

-# Hawkins, Bradu, Kass data
-hbk = DataFrame(
+"""
+Hawkins & Bradu & Kass data
+# Components
+- `x1::Float64`: first independent variable.
+- `x2::Float64`: second independent variable.
+- `x3::Float64`: third independent variable.
+- `y::Float64`: dependent (response) variable.
+# Reference
+Hawkins, D.M., Bradu, D., and Kass, G.V. (1984) Location of several outliers in multiple regression data using elemental sets. Technometrics 26, 197–208.
+"""
+const hbk = DataFrame(
     x1 = [10.1, 9.5, 10.7, 9.9, 10.3, 10.8, 10.5, 9.9, 9.7, 9.3, 11, 12, 12, 11, 3.4, 3.1, 0, 2.3, 0.8, 3.1, 2.6, 0.4, 2, 1.3, 1, 0.9, 3.3, 1.8, 1.2, 1.2, 3.1, 0.5, 1.5, 0.4, 3.1, 1.1, 0.1, 1.5, 2.1, 0.5, 3.4, 0.3, 0.1, 1.8, 1.9, 1.8, 3, 3.1, 3.1, 2.1, 2.3, 3.3, 0.3, 1.1, 0.5, 1.8, 1.8, 2.4, 1.6, 0.3, 0.4, 0.9, 1.1, 2.8, 2, 0.2, 1.6, 0.1, 2, 1, 2.2, 0.6, 0.3, 0, 0.3],
     x2 = [19.6, 20.5, 20.2, 21.5, 21.1, 20.4, 20.9, 19.6, 20.7, 19.7, 24, 23, 26, 34, 2.9, 2.2, 1.6, 1.6, 2.9, 3.4, 2.2, 3.2, 2.3, 2.3, 0, 3.3, 2.5, 0.8, 0.9, 0.7, 1.4, 2.4, 3.1, 0, 2.4, 2.2, 3, 1.2, 0, 2, 1.6, 1, 3.3, 0.5, 0.1, 0.5, 0.1, 1.6, 2.5, 2.8, 1.5, 0.6, 0.4, 3, 2.4, 3.2, 0.7, 3.4, 2.1, 1.5, 3.4, 0.1, 2.7, 3, 0.7, 1.8, 2, 0, 0.6, 2.2, 2.5, 2, 1.7, 2.2, 0.4],
     x3 = [28.3, 28.9, 31, 31.7, 31.1, 29.2, 29.1, 28.8, 31, 30.3, 35, 37, 34, 34, 2.1, 0.3, 0.2, 2, 1.6, 2.2, 1.9, 1.9, 0.8, 0.5, 0.4, 2.5, 2.9, 2, 0.8, 3.4, 1, 0.3, 1.5, 0.7, 3, 2.7, 2.6, 0.2, 1.2, 1.2, 2.9, 2.7, 0.9, 3.2, 0.6, 3, 0.8, 3, 1.9, 2.9, 0.4, 1.2, 3.3, 0.3, 0.9, 0.9, 0.7, 1.5, 3, 3.3, 3, 0.3, 0.2, 2.9, 2.7, 0.8, 1.2, 1.1, 0.3, 2.9, 2.3, 1.5, 2.2, 1.6, 2.6],
     y = [9.7, 10.1, 10.3, 9.5, 10, 10, 10.8, 10.3, 9.6, 9.9, -0.2, -0.4, 0.7, 0.1, -0.4, 0.6, -0.2, 0, 0.1, 0.4, 0.9, 0.3, -0.8, 0.7, -0.3, -0.8, -0.7, 0.3, 0.3, -0.3, 0, -0.4, -0.6, -0.7, 0.3, -1, -0.6, 0.9, -0.7, -0.5, -0.1, -0.7, 0.6, -0.7, -0.5, -0.4, -0.9, 0.1, 0.9, -0.4, 0.7, -0.5, 0.7, 0.7, 0, 0.1, 0.7, -0.1, -0.3, -0.9, -0.3, 0.6, -0.3, -0.5, 0.6, -0.9, -0.7, 0.6, 0.2, 0.7, 0.2, -0.2, 0.4, -0.9, 0.2]
 )
+
+"""
+Animals data
+# Components
+- `names::AbstractString`: names of animals.
+- `body::Float64`: body weight in kg.
+- `brain::Float64`: brain weight in g.
+# References
+Venables, W. N. and Ripley, B. D. (1999) _Modern Applied
+Statistics with S-PLUS._ Third Edition. Springer.
+P. J. Rousseeuw and A. M. Leroy (1987) _Robust Regression and
+Outlier Detection._ Wiley, p. 57.
+"""
+const animals = DataFrame(
+    names = ["Mountain beaver", "Cow", "Grey wolf", "Goat", "Guinea pig", "Dipliodocus", "Asian elephant", "Donkey", "Horse", "Potar monkey", "Cat", "Giraffe", "Gorilla", "Human", "African elephant", "Triceratops", "Rhesus monkey", "Kangaroo", "Golden hamster", "Mouse", "Rabbit", "Sheep", "Jaguar", "Chimpanzee", "Rat", "Brachiosaurus", "Mole", "Pig"],
+    body = [1.35, 465, 36.33, 27.66, 1.04, 11700, 2547, 187.1, 521, 10, 3.3, 529, 207, 62, 6654, 9400, 6.8, 35, 0.12, 0.023, 2.5, 55.5, 100, 52.16, 0.28, 87000, 0.122, 192],
+    brain = [8.1, 423, 119.5, 115, 5.5, 50, 4603, 419, 655, 115, 25.6, 680, 406, 1320, 5712, 70, 179, 56, 1, 0.4, 12.1, 175, 157, 440, 1.9, 154.5, 3, 180]
+)
+
+"""
+Weight loss data
+# Components
+- `days::Integer`: time in days since the start of the diet program.
+- `weight::Float64`: weight in kg.
+# Reference
+Venables, W. N. and Ripley, B. D. (1999) _Modern Applied
+Statistics with S-PLUS._ Third Edition. Springer.
+"""
+const weightloss = DataFrame(
+    days = [0, 4, 7, 7, 11, 18, 24, 30, 32, 43, 46, 60, 64, 70, 71, 71, 73, 74, 84, 88, 95, 102, 106, 109, 115, 122, 133, 137, 140, 143, 147, 148, 149, 150, 153, 156, 161, 164, 165, 165, 170, 176, 179, 198, 214, 218, 221, 225, 233, 238, 241, 246],
+    weight = [184.35, 182.51, 180.45, 179.91, 177.91, 175.81, 173.11, 170.06, 169.31, 165.1, 163.11, 158.3, 155.8, 154.31, 153.86, 154.2, 152.2, 152.8, 150.3, 147.8, 146.1, 145.6, 142.5, 142.3, 139.4, 137.9, 133.7, 133.7, 133.3, 131.2, 133, 132.2, 130.8, 131.3, 129, 127.9, 126.9, 127.7, 129.5, 128.4, 125.4, 124.9, 124.9, 118.2, 118.2, 115.3, 115.7, 116, 115.5, 112.6, 114, 112.6]
+)
+
+"""
+Stack loss data
+# Components
+- `airflow::Float64`: flow of cooling air (independent variable).
+- `watertemp::Float64`: cooling water inlet temperature (independent variable).
+- `acidcond::Float64`: concentration of acid (independent variable).
+- `stackloss::Float64`: stack loss (dependent variable).
+# References
+Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S Language_. Wadsworth & Brooks/Cole.
+Dodge, Y. (1996) The guinea pig of multiple regression. In: _Robust Statistics, Data Analysis, and Computer Intensive Methods;
+In Honor of Peter Huber's 60th Birthday_, 1996, _Lecture Notes in Statistics_ *109*, Springer-Verlag, New York.
+"""
+const stackloss = DataFrame(
+    airflow = [80.0, 80, 75, 62, 62, 62, 62, 62, 58, 58, 58, 58, 58, 58, 50, 50, 50, 50, 50, 56, 70],
+    watertemp = [27.0, 27, 25, 24, 22, 23, 24, 24, 23, 18, 18, 17, 18, 19, 18, 18, 19, 19, 20, 20, 20],
+    acidcond = [89.0, 88, 90, 87, 87, 87, 93, 93, 87, 80, 89, 88, 82, 93, 89, 86, 72, 79, 80, 82, 91],
+    stackloss = [42.0, 37, 37, 28, 18, 18, 19, 20, 15, 14, 14, 13, 11, 12, 8, 7, 8, 8, 9, 15, 15]
+)
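For orientation (not part of the commit), a minimal sketch of how these bundled datasets might be inspected once the package is loaded. It assumes the names above are exported by LinRegOutliers and that DataFrames is available in the environment:

using DataFrames
using LinRegOutliers   # assumed to export phones, hbk, animals, weightloss, stackloss

first(phones, 5)       # first rows of the telephone data (year, calls)
size(hbk)              # dimensions of the Hawkins-Bradu-Kass data
describe(stackloss)    # summary statistics of the stack loss data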
@@ -0,0 +1,80 @@
# Returns true if p1 Pareto-dominates p2: p1 is not smaller than p2 in any
# coordinate and is strictly larger in at least one.
function dominates(p1::Array, p2::Array)::Bool
    n = length(p1)
    notworse = count(i -> p1[i] < p2[i], 1:n)
    better = count(i -> p1[i] > p2[i], 1:n)
    return (notworse == 0) && (better > 0)
end
# Non-dominated sorting ranks: ranks[i] counts how many of the other
# observations observation i dominates. The DataFrame method forwards to the
# Matrix method.
function ndsranks(data::DataFrame)::Array{Int}
    mat = convert(Matrix, data)
    return ndsranks(mat)
end

function ndsranks(data::Matrix)::Array{Int}
    n, p = size(data)
    ranks = zeros(Int, n)
    mat = convert(Matrix, data)
    for i in 1:n
        for j in 1:n
            if i != j
                if dominates(mat[i,:], mat[j,:])
                    ranks[i] += 1
                end
            end
        end
    end
    return ranks
end
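To make the helpers concrete, a small worked example (not part of the diff), traced by hand from the definitions above:

dominates([2.0, 3.0], [1.0, 3.0])   # true:  never smaller, larger in the first coordinate
dominates([2.0, 1.0], [1.0, 3.0])   # false: smaller in the second coordinate

# ranks[i] counts how many of the other rows row i dominates
ndsranks([1.0 1.0; 2.0 2.0; 0.0 3.0])   # returns [0, 1, 0]; only the second row dominates another row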
# Returns the indices of the middle p (or p + 1, when n - p is odd) elements
# of the sequence 1:n.
function midlist(n::Int, p::Int)::Array{Int, 1}
    midlist = []
    if (n - p) % 2 == 0
        start = ((n - p) / 2) + 1
        stop = start + p - 1
        midlist = collect(start:stop)
    else
        start = Int(floor((n - p) / 2)) + 1
        stop = start + p
        midlist = collect(start:stop)
    end
    return midlist
end
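A quick check of midlist (again not part of the diff), traced from the definition above:

midlist(10, 2)   # n - p even: the p middle indices     -> [5, 6]
midlist(10, 3)   # n - p odd:  the p + 1 middle indices -> [4, 5, 6, 7]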
function satman2015(setting::RegressionSetting)
    X = designMatrix(setting)
    Y = responseVector(setting)
    n, p = size(X)
    h = Int(floor((n + p + 1.0) / 2.0))

    allindices = collect(1:n)

    # Rank observations by non-dominated sorting and take the observations in
    # the middle of that ordering as the initial basic subset.
    ranks = ndsranks(X)
    ranks_ordering = sortperm(ranks)

    basic_center_indices = midlist(n, p)
    basic_subset_indices = ranks_ordering[basic_center_indices]

    # Mahalanobis distances relative to the basic subset; the h observations
    # with the smallest distances seed the concentration (C) steps.
    meanvector = applyColumns(mean, X[basic_subset_indices,:])
    covmat = cov(X[basic_subset_indices,:])
    md2 = diag(mahalabonisSquaredMatrix(X, meanvector = meanvector, covmatrix = covmat))
    md = sqrt.(md2)
    sorted_indices = sortperm(md)
    best_h_indices = sorted_indices[1:h]

    crit, bestset = iterateCSteps(setting, best_h_indices, h)

    # Fit OLS on the best h-subset and flag observations whose median-centered
    # residuals, scaled by the median absolute deviation, exceed 2.5 in
    # absolute value.
    regy = Y[bestset]
    regx = X[bestset,:]
    ols = lm(setting.formula, setting.data[bestset, :])
    betas = coef(ols)
    resids = Y .- (X * betas)
    med_res = median(resids)
    standardized_resids = (resids .- med_res) / median(abs.(resids .- med_res))

    outlierset = filter(i -> abs(standardized_resids[i]) > 2.5, allindices)

    result = Dict()
    result["outliers"] = outlierset

    return result
end
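Finally, a hedged sketch of how the new detector could be driven end to end on the bundled telephone data. The "outliers" result key comes from the code above; the createRegressionSetting helper and the @formula macro are assumed to be provided (or re-exported) by the package and are not shown in this diff:

using LinRegOutliers

# Regress calls on year for the telephone data and run the Satman (2015) detector.
setting = createRegressionSetting(@formula(calls ~ year), phones)
result = satman2015(setting)

result["outliers"]   # indices of observations flagged by the 2.5 cut-off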