Commit 99ba2c2

new algorithm and datasets

jbytecode committed Aug 27, 2020
1 parent 38a30fa commit 99ba2c2
Showing 5 changed files with 195 additions and 7 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
authors = ["Mehmet Hakan Satman <[email protected]>"]
name = "LinRegOutliers"
uuid = "6d4de0fb-32d9-4c65-aac1-cc9ed8b94b1a"
authors = ["Mehmet Hakan Satman <[email protected]>"]
version = "0.2.0"
version = "0.3.0"

[deps]
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
2 changes: 2 additions & 0 deletions src/LinRegOutliers.jl
@@ -21,6 +21,7 @@ include("mveltsplot.jl")
include("bch.jl")
include("py95.jl")
include("satman2013.jl")
include("satman2015.jl")

# Essentials from other packages
export @formula, DataFrame
@@ -53,5 +54,6 @@ export mveltsplot
export bch, bchplot
export py95, py95SuspectedObservations
export satman2013
export satman2015

end # module
100 changes: 96 additions & 4 deletions src/data.jl
@@ -1,14 +1,106 @@
# Telephone data
phones = DataFrame(
"""
Phone data
# Components
- `year::Integer`: years from 1950 to 1973, coded as 50 to 73.
- `calls::Float64`: phone calls (in millions).
# Reference
P. J. Rousseeuw and A. M. Leroy (1987) _Robust Regression and
Outlier Detection._ Wiley.
"""
const phones = DataFrame(
year = collect(50:73),
calls = [4.4, 4.7, 4.7, 5.9, 6.6, 7.3, 8.1, 8.8, 10.6, 12.0, 13.5, 14.9, 16.1, 21.2, 119.0, 124.0,
142.0, 159.0, 182.0, 212.0, 43.0, 24.0, 27.0, 29.0]
)

# Hawkins, Bradu, Kass data
hbk = DataFrame(


"""
Hawkins & Bradu & Kass data
# Components
- `x1::Float64`: first independent variable.
- `x2::Float64`: second independent variable.
- `x3::Float64`: third independent variable.
- `y::Float64`: dependent (response) variable.
# Reference
Hawkins, D.M., Bradu, D., and Kass, G.V. (1984) Location of several outliers in multiple regression data using elemental sets. Technometrics 26, 197–208.
"""
const hbk = DataFrame(
x1 = [10.1, 9.5, 10.7, 9.9, 10.3, 10.8, 10.5, 9.9, 9.7, 9.3, 11, 12, 12, 11, 3.4, 3.1, 0, 2.3, 0.8, 3.1, 2.6, 0.4, 2, 1.3, 1, 0.9, 3.3, 1.8, 1.2, 1.2, 3.1, 0.5, 1.5, 0.4, 3.1, 1.1, 0.1, 1.5, 2.1, 0.5, 3.4, 0.3, 0.1, 1.8, 1.9, 1.8, 3, 3.1, 3.1, 2.1, 2.3, 3.3, 0.3, 1.1, 0.5, 1.8, 1.8, 2.4, 1.6, 0.3, 0.4, 0.9, 1.1, 2.8, 2, 0.2, 1.6, 0.1, 2, 1, 2.2, 0.6, 0.3, 0, 0.3],
x2 = [19.6, 20.5, 20.2, 21.5, 21.1, 20.4, 20.9, 19.6, 20.7, 19.7, 24, 23, 26, 34, 2.9, 2.2, 1.6, 1.6, 2.9, 3.4, 2.2, 3.2, 2.3, 2.3, 0, 3.3, 2.5, 0.8, 0.9, 0.7, 1.4, 2.4, 3.1, 0, 2.4, 2.2, 3, 1.2, 0, 2, 1.6, 1, 3.3, 0.5, 0.1, 0.5, 0.1, 1.6, 2.5, 2.8, 1.5, 0.6, 0.4, 3, 2.4, 3.2, 0.7, 3.4, 2.1, 1.5, 3.4, 0.1, 2.7, 3, 0.7, 1.8, 2, 0, 0.6, 2.2, 2.5, 2, 1.7, 2.2, 0.4],
x3 = [28.3, 28.9, 31, 31.7, 31.1, 29.2, 29.1, 28.8, 31, 30.3, 35, 37, 34, 34, 2.1, 0.3, 0.2, 2, 1.6, 2.2, 1.9, 1.9, 0.8, 0.5, 0.4, 2.5, 2.9, 2, 0.8, 3.4, 1, 0.3, 1.5, 0.7, 3, 2.7, 2.6, 0.2, 1.2, 1.2, 2.9, 2.7, 0.9, 3.2, 0.6, 3, 0.8, 3, 1.9, 2.9, 0.4, 1.2, 3.3, 0.3, 0.9, 0.9, 0.7, 1.5, 3, 3.3, 3, 0.3, 0.2, 2.9, 2.7, 0.8, 1.2, 1.1, 0.3, 2.9, 2.3, 1.5, 2.2, 1.6, 2.6],
y = [9.7, 10.1, 10.3, 9.5, 10, 10, 10.8, 10.3, 9.6, 9.9, -0.2, -0.4, 0.7, 0.1, -0.4, 0.6, -0.2, 0, 0.1, 0.4, 0.9, 0.3, -0.8, 0.7, -0.3, -0.8, -0.7, 0.3, 0.3, -0.3, 0, -0.4, -0.6, -0.7, 0.3, -1, -0.6, 0.9, -0.7, -0.5, -0.1, -0.7, 0.6, -0.7, -0.5, -0.4, -0.9, 0.1, 0.9, -0.4, 0.7, -0.5, 0.7, 0.7, 0, 0.1, 0.7, -0.1, -0.3, -0.9, -0.3, 0.6, -0.3, -0.5, 0.6, -0.9, -0.7, 0.6, 0.2, 0.7, 0.2, -0.2, 0.4, -0.9, 0.2]
)



"""
Animals data
# Components
- `names::AbstractString`: names of animals.
- `body::Float64`: body weight in kg.
- `brain::Float64`: brain weight in g.
# References
Venables, W. N. and Ripley, B. D. (1999) _Modern Applied
Statistics with S-PLUS._ Third Edition. Springer.
P. J. Rousseeuw and A. M. Leroy (1987) _Robust Regression and
Outlier Detection._ Wiley, p. 57.
"""
const animals = DataFrame(
names = ["Mountain beaver", "Cow", "Grey wolf", "Goat", "Guinea pig", "Dipliodocus", "Asian elephant", "Donkey", "Horse", "Potar monkey", "Cat", "Giraffe", "Gorilla", "Human", "African elephant", "Triceratops", "Rhesus monkey", "Kangaroo", "Golden hamster", "Mouse", "Rabbit", "Sheep", "Jaguar", "Chimpanzee", "Rat", "Brachiosaurus", "Mole", "Pig"],
body = [1.35, 465, 36.33, 27.66, 1.04, 11700, 2547, 187.1, 521, 10, 3.3, 529, 207, 62, 6654, 9400, 6.8, 35, 0.12, 0.023, 2.5, 55.5, 100, 52.16, 0.28, 87000, 0.122, 192],
brain = [8.1, 423, 119.5, 115, 5.5, 50, 4603, 419, 655, 115, 25.6, 680, 406, 1320, 5712, 70, 179, 56, 1, 0.4, 12.1, 175, 157, 440, 1.9, 154.5, 3, 180]
)



"""
Weight loss data
# Components
- `days::Integer`: time in days since the start of the diet program.
- `weight::Float64`: weight in kg.
# Reference
Venables, W. N. and Ripley, B. D. (1999) _Modern Applied
Statistics with S-PLUS._ Third Edition. Springer.
"""
const weightloss = DataFrame(
days = [0, 4, 7, 7, 11, 18, 24, 30, 32, 43, 46, 60, 64, 70, 71, 71, 73, 74, 84, 88, 95, 102, 106, 109, 115, 122, 133, 137, 140, 143, 147, 148, 149, 150, 153, 156, 161, 164, 165, 165, 170, 176, 179, 198, 214, 218, 221, 225, 233, 238, 241, 246],
weight = [184.35, 182.51, 180.45, 179.91, 177.91, 175.81, 173.11, 170.06, 169.31, 165.1, 163.11, 158.3, 155.8, 154.31, 153.86, 154.2, 152.2, 152.8, 150.3, 147.8, 146.1, 145.6, 142.5, 142.3, 139.4, 137.9, 133.7, 133.7, 133.3, 131.2, 133, 132.2, 130.8, 131.3, 129, 127.9, 126.9, 127.7, 129.5, 128.4, 125.4, 124.9, 124.9, 118.2, 118.2, 115.3, 115.7, 116, 115.5, 112.6, 114, 112.6]
)


"""
Stack loss data
# Components
- `airflow::Float64`: flow of cooling air (independent variable).
- `watertemp::Float64`: cooling water inlet temperature (independent variable).
- `acidcond::Float64`: concentration of acid (independent variable).
- `stackloss::Float64`: stack loss (dependent variable).
# References
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S Language_. Wadsworth & Brooks/Cole.
Dodge, Y. (1996) The guinea pig of multiple regression. In: _Robust Statistics, Data Analysis, and Computer Intensive Methods;
In Honor of Peter Huber's 60th Birthday_, 1996, _Lecture Notes in Statistics_ *109*, Springer-Verlag, New York.
"""
const stackloss = DataFrame(
airflow = [80.0, 80, 75, 62, 62, 62, 62, 62, 58, 58, 58, 58, 58, 58, 50, 50, 50, 50, 50, 56, 70],
watertemp = [27.0, 27, 25, 24, 22, 23, 24, 24, 23, 18, 18, 17, 18, 19, 18, 18, 19, 19, 20, 20, 20],
acidcond = [89.0, 88, 90, 87, 87, 87, 93, 93, 87, 80, 89, 88, 82, 93, 89, 86, 72, 79, 80, 82, 91],
stackloss = [42.0, 37, 37, 28, 18, 18, 19, 20, 15, 14, 14, 13, 11, 12, 8, 7, 8, 8, 9, 15, 15]
)
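
Each dataset is now a documented const DataFrame, so it can be fed directly into a regression setting. A minimal usage sketch, assuming the helpers exported by the package:

using LinRegOutliers
setting = createRegressionSetting(@formula(stackloss ~ airflow + watertemp + acidcond), stackloss)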
80 changes: 80 additions & 0 deletions src/satman2015.jl
@@ -0,0 +1,80 @@
function dominates(p1::Array, p2::Array)::Bool
    # p1 dominates p2 if it is no worse in every component
    # and strictly better in at least one.
    n = length(p1)
    worse = count(i -> p1[i] < p2[i], 1:n)
    better = count(i -> p1[i] > p2[i], 1:n)
    return (worse == 0) && (better > 0)
end
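
dominates implements the usual Pareto-dominance test: p1 dominates p2 when it is no worse in every component and strictly better in at least one. A minimal sketch of the expected behavior, mirroring the tests added in this commit:

dominates([1, 2, 4], [1, 2, 3])  # true: no worse anywhere, strictly better in the last component
dominates([1, 2, 3], [1, 2, 3])  # false: equal vectors, nothing strictly better
dominates([1, 2, 3], [1, 2, 4])  # false: worse in the last component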

function ndsranks(data::DataFrame)::Array{Int}
    # Delegate to the Matrix method.
    mat = Matrix(data)
    return ndsranks(mat)
end

function ndsranks(data::Matrix)::Array{Int}
    # Rank each observation by the number of other observations it dominates.
    n = size(data, 1)
    ranks = zeros(Int, n)
    for i in 1:n
        for j in 1:n
            if i != j && dominates(data[i, :], data[j, :])
                ranks[i] += 1
            end
        end
    end
    return ranks
end
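
ndsranks scores each row by how many other rows it dominates, so observations deeper in the point cloud receive higher ranks. An illustrative call against the Matrix method, assuming three ordered points:

ndsranks([1.0 1.0; 2.0 2.0; 3.0 3.0])  # [0, 1, 2]: each row dominates every row with strictly smaller entries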

function midlist(n::Int, p::Int)::Array{Int, 1}
    # Indices of the middle p observations (p + 1 when n - p is odd)
    # in a list of n ordered observations.
    start = div(n - p, 2) + 1
    stop = (n - p) % 2 == 0 ? start + p - 1 : start + p
    return collect(start:stop)
end
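
midlist selects the central positions of an ordered list: p indices when n - p is even, p + 1 when it is odd. For example, with the phones data (n = 24) and p = 2:

midlist(24, 2)  # [12, 13]
midlist(23, 2)  # [11, 12, 13]: one extra index when n - p is odd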

function satman2015(setting::RegressionSetting)
    X = designMatrix(setting)
    Y = responseVector(setting)
    n, p = size(X)
    h = Int(floor((n + p + 1.0) / 2.0))

    allindices = collect(1:n)

    # Order observations by their non-dominated sorting ranks and
    # take the middle ones as the initial basic subset.
    ranks = ndsranks(X)
    ranks_ordering = sortperm(ranks)

    basic_center_indices = midlist(n, p)
    basic_subset_indices = ranks_ordering[basic_center_indices]

    # Grow the basic subset to the h observations with the smallest
    # Mahalanobis distances, then refine it with C-steps.
    meanvector = applyColumns(mean, X[basic_subset_indices, :])
    covmat = cov(X[basic_subset_indices, :])
    md2 = diag(mahalabonisSquaredMatrix(X, meanvector = meanvector, covmatrix = covmat))
    md = sqrt.(md2)
    sorted_indices = sortperm(md)
    best_h_indices = sorted_indices[1:h]

    crit, bestset = iterateCSteps(setting, best_h_indices, h)

    # Fit OLS on the clean subset and flag observations whose
    # median-standardized residuals exceed 2.5 in absolute value.
    ols = lm(setting.formula, setting.data[bestset, :])
    betas = coef(ols)
    resids = Y .- (X * betas)
    med_res = median(resids)
    standardized_resids = (resids .- med_res) / median(abs.(resids .- med_res))

    outlierset = filter(i -> abs(standardized_resids[i]) > 2.5, allindices)

    result = Dict()
    result["outliers"] = outlierset

    return result
end
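
A typical call, mirroring the test added at the bottom of this commit for the phones data:

setting = createRegressionSetting(@formula(calls ~ year), phones)
result = satman2015(setting)
result["outliers"]  # indices of the observations flagged as outliers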
16 changes: 15 additions & 1 deletion test/runtests.jl
@@ -53,7 +53,7 @@ end
end


@testset "Kianifard & Shallow 1989 - Algorithm" begin
@testset "Kianifard & Swallow 1989 - Algorithm" begin
df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
outset = ks89(reg)
@@ -215,3 +215,17 @@ end
end
end

@testset "Satman2015 - Algorithm - Phone data" begin
@test LinRegOutliers.dominates([1,2,4], [1,2,3])
@test LinRegOutliers.dominates([1,2,3], [0,1,2])
@test !LinRegOutliers.dominates([1,2,3], [1,2,3])
@test !LinRegOutliers.dominates([1,2,3], [1,2,4])

df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
result = satman2015(reg)
outliers = result["outliers"]
for i in 15:20
@test i in outliers
end
end
