# Predefined loss functions
# You may also implement your own loss by subtyping the abstract type Loss.
#
# Losses must have the following:
# Fields:
# `scale::Float64`
# This field represents a scalar weight assigned to the loss function: w*l(u,a)
# `domain::natural_Domain`
# The "natural" domain that the loss function was meant to handle. E.g. BoolDomain for LogisticLoss,
# RealDomain for QuadLoss, etc.
# Other fields may also be included to encode parameters of the loss function, encode the range or
# set of possible values of the data, etc.
#
# Methods:
# `my_loss_type(args..., scale=1.0::Float64;
# domain=natural_Domain(args[range]...), kwargs...) ::my_loss_type`
# Constructor for the loss type. The first few arguments are parameters for
# which there is no reasonable default (a loss may not need any of these).
# The last positional argument should be the scale, which should default to 1.
# The `domain` keyword must default to an appropriate Domain, which may be constructed
# from the positional arguments. Parameters besides the scale for which there are
# reasonable defaults should be included as keyword arguments (there may be none).
# `evaluate(l::my_loss_type, u::Float64, a::Number) ::Float64`
# Evaluates the function l(u,a) where u is the approximation of a
# `grad(l::my_loss_type, u::Float64, a::Number) ::Float64`
# Evaluates the gradient of the loss at the given point (u,a)
# In addition, loss functions should preferably implement methods:
# `M_estimator(l::my_loss_type, a::AbstractArray) ::Float64`
# Finds uₒ = argmin ∑l(u,aᵢ) which is the best single estimate of the array `a`
# If `M_estimator` is not implemented, a live optimization procedure will be used when this function is
# called in order to compute loss function scalings. The live optimization may be slow, so an analytic
# implementation is preferable.
# `impute(d::Domain, l::my_loss_type, u::Array{Float64})` (in impute_and_err.jl)
# Finds a = argmin l(u,a), the most likely value for an observation given a parameter u.
# A minimal illustrative sketch of a custom loss written against this interface appears below,
# just after the generic scale/domain helpers.
import Base: *, convert
import Optim: optimize, LBFGS, minimizer
# mean, median and quantile live in the Statistics stdlib on Julia >= 0.7; the enclosing module
# may already load it, but we import the names used below explicitly
import Statistics: mean, median, quantile
export Loss,
DiffLoss, ClassificationLoss, SingleDimLoss, # categories of Losses
QuadLoss, L1Loss, HuberLoss, QuantileLoss, # losses for predicting reals
PoissonLoss, # losses for predicting integers
HingeLoss, WeightedHingeLoss, LogisticLoss, # losses for predicting booleans
OrdinalHingeLoss, OrdisticLoss, MultinomialOrdinalLoss, BvSLoss, # losses for predicting ordinals
MultinomialLoss, OvALoss, # losses for predicting nominals (categoricals)
PeriodicLoss, # losses for predicting periodic variables
evaluate, grad, M_estimator, # methods on losses
avgerror, scale, mul!, *,
embedding_dim, get_yidxs, datalevels, domain
abstract type Loss end
# a DiffLoss is one in which l(u,a) = f(u-a) AND argmin f(x) = 0
# for example, QuadLoss(u,a)=(u-a)² and we can write f(x)=x² and x=u-a
abstract type DiffLoss<:Loss end
# a ClassificationLoss is one whose observations are boolean (true, encoded as 1, or false, encoded as 0 or -1)
# AND for which argmin_a l(u,a) = (u >= 0 ? true : false)
abstract type ClassificationLoss<:Loss end
# Single Dimensional losses are DiffLosses or ClassificationLosses, which allow optimized evaluate and grad functions
const SingleDimLoss = Union{DiffLoss, ClassificationLoss}
mul!(l::Loss, newscale::Number) = (l.scale = newscale; l)
scale(l::Loss) = l.scale
*(newscale::Number, l::Loss) = (newl = copy(l); mul!(newl, newscale))
*(l::Loss, newscale::Number) = (newl = copy(l); mul!(newl, newscale))
domain(l::Loss) = l.domain
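# For concreteness, here is a minimal sketch of a custom loss written against the interface
# described at the top of this file. It is purely illustrative and not part of the package:
# the name LogCoshLoss is made up, and it assumes RealDomain is available from domains.jl
# (just as QuadLoss below assumes). No analytic M_estimator is given, so the generic
# Optim-based fallback defined later in this file would be used when computing scalings.
mutable struct LogCoshLoss<:DiffLoss
    scale::Float64
    domain::Domain
end
LogCoshLoss(scale=1.0::Float64; domain=RealDomain()) = LogCoshLoss(scale, domain)
evaluate(l::LogCoshLoss, u::Real, a::Number) = l.scale*log(cosh(u-a))
grad(l::LogCoshLoss, u::Real, a::Number) = l.scale*tanh(u-a)  # d/du log(cosh(u-a)) = tanh(u-a)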
### embedding dimensions: mappings from losses/columns of A to columns of Y
# default number of columns
# number of columns is higher for multidimensional losses
embedding_dim(l::Loss) = 1
embedding_dim(l::Array{LossSubtype,1}) where LossSubtype<:Loss = sum(map(embedding_dim, l))
# find spans of loss functions (for multidimensional losses)
function get_yidxs(losses::Array{LossSubtype,1}) where LossSubtype<:Loss
n = length(losses)
ds = map(embedding_dim, losses)
d = sum(ds)
featurestartidxs = cumsum(append!([1], ds))
# find which columns of Y map to which columns of A (for multidimensional losses)
U = Union{UnitRange{Int}, Int}
yidxs = Array{U}(undef, n)
for f = 1:n
if ds[f] == 1
yidxs[f] = featurestartidxs[f]
else
yidxs[f] = featurestartidxs[f]:featurestartidxs[f]+ds[f]-1
end
end
return yidxs
end
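# Illustrative sketch (not part of the package API): how embedding_dim and get_yidxs carve the
# columns of Y up among the columns of A. With one real, one 3-level categorical and one boolean
# column, the embedding dimensions are 1, 3 and 1, so the spans are [1, 2:4, 5]. The function
# name below is hypothetical and safe to delete.
function example_get_yidxs()
    losses = Loss[QuadLoss(), MultinomialLoss(3), LogisticLoss()]
    ds = map(embedding_dim, losses)   # [1, 3, 1]
    yidxs = get_yidxs(losses)         # [1, 2:4, 5]
    return ds, yidxs
end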
### promote integers to floats if given as the argument u
## causes ambiguity warnings
# evaluate(l::Loss, u::Number, a) = evaluate(l,convert(Float64,u),a)
# grad(l::Loss, u::Number, a) = grad(l,convert(Float64,u),a)
# evaluate{T<:Number}(l::Loss, u::Array{T,1}, a) = evaluate(l,convert(Array{Float64,1},u),a)
# grad{T<:Number}(l::Loss, u::Array{T,1}, a) = grad(l,convert(Array{Float64,1},u),a)
### -1,0,1::Int are translated to Booleans if loss is not defined on numbers
# convert(::Type{Bool}, x::Int) = x==1 ? true : (x==-1 || x==0) ? false : throw(InexactError("Bool method successfully overloaded by LowRankModels"))
myBool(x::Int) = x==1 ? true : (x==-1 || x==0) ? false : throw(InexactError(:myBool, Bool, x))
evaluate(l::ClassificationLoss, u::Real, a::Int) = evaluate(l,u,myBool(a))
grad(l::ClassificationLoss, u::Real, a::Int) = grad(l,u,myBool(a))
M_estimator(l::ClassificationLoss, a::AbstractArray{Int,1}) = M_estimator(l, myBool.(a))
### M-estimators
# The following is the fallback M-estimator, used for loss functions that don't define their own:
# it minimizes u -> ∑l(u,aᵢ) numerically with Optim. Numerical optimization can be slow, so an
# analytic implementation of `M_estimator` is preferable for each loss. The fallback is also useful
# for checking that the analytic M_estimators are correct; to force it for a loss that has an
# analytic method, call it explicitly via `invoke(M_estimator, Tuple{Loss,AbstractArray}, l, a)`.
function M_estimator(l::Loss, a::AbstractArray)
# the function to optimize over
f = (u -> sum(map(ai->evaluate(l,u[1],ai), a))) # u is indexed because `optim` assumes input is a vector
# the gradient of that function
function g!(storage::Vector, u::Vector) # this is the format `optim` expects
storage[1] = sum(map(ai->grad(l,u[1],ai), a))
end
m = minimizer(optimize(f, g!, [median(a)], LBFGS()))[1]
end
# Uses uₒ = argmin ∑l(u,aᵢ) to find (1/n)*∑l(uₒ,aᵢ) which is the
# average error incurred by using the estimate uₒ for every aᵢ
function avgerror(l::Loss, a::AbstractArray)
b = collect(skipmissing(a))
m = M_estimator(l,b)
sum(map(ai->evaluate(l,m,ai),b))/length(b)
end
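# Illustrative sketch (not part of the package API): for QuadLoss the M-estimator is the mean,
# so avgerror reduces to the mean squared deviation from the mean. The function name below is
# hypothetical and safe to delete.
function example_avgerror()
    a = [1.0, 2.0, 3.0, 6.0]
    m = M_estimator(QuadLoss(), a)   # mean(a) == 3.0
    err = avgerror(QuadLoss(), a)    # mean((a .- m).^2) == 3.5
    return m, err
end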
## Losses:
########################################## QUADRATIC ##########################################
# f: ℜxℜ -> ℜ
mutable struct QuadLoss<:DiffLoss
scale::Float64
domain::Domain
end
QuadLoss(scale=1.0::Float64; domain=RealDomain()) = QuadLoss(scale, domain)
evaluate(l::QuadLoss, u::Real, a::Number) = l.scale*(u-a)^2
grad(l::QuadLoss, u::Real, a::Number) = 2*(u-a)*l.scale
M_estimator(l::QuadLoss, a::AbstractArray) = mean(a)
########################################## L1 ##########################################
# f: ℜxℜ -> ℜ
mutable struct L1Loss<:DiffLoss
scale::Float64
domain::Domain
end
L1Loss(scale=1.0::Float64; domain=RealDomain()) = L1Loss(scale, domain)
evaluate(l::L1Loss, u::Real, a::Number) = l.scale*abs(u-a)
grad(l::L1Loss, u::Real, a::Number) = sign(u-a)*l.scale
M_estimator(l::L1Loss, a::AbstractArray) = median(a)
########################################## HUBER ##########################################
# f: ℜxℜ -> ℜ
mutable struct HuberLoss<:DiffLoss
scale::Float64
domain::Domain
crossover::Float64 # where the quadratic loss ends and the linear loss begins; =1 for standard HuberLoss
end
HuberLoss(scale=1.0::Float64; domain=RealDomain(), crossover=1.0::Float64) = HuberLoss(scale, domain, crossover)
function evaluate(l::HuberLoss, u::Real, a::Number)
abs(u-a) > l.crossover ? (abs(u-a) - l.crossover + l.crossover^2)*l.scale : (u-a)^2*l.scale
end
grad(l::HuberLoss,u::Real,a::Number) = abs(u-a)>l.crossover ? sign(u-a)*l.scale : 2*(u-a)*l.scale
M_estimator(l::HuberLoss, a::AbstractArray) = median(a) # a heuristic, not the true estimator
########################################## QUANTILE ##########################################
# f: ℜxℜ -> ℜ
# define (u)_+ = max(u,0), (u)_- = max(-u,0) so (u)_+ + (u)_- = |u|
# f(u,a) = quantile*(a-u)_+ + (1-quantile)*(a-u)_-
# fits the `quantile`th quantile of the distribution
mutable struct QuantileLoss<:DiffLoss
scale::Float64
domain::Domain
quantile::Float64 # the quantile to fit (e.g. 0.5 fits the median)
end
QuantileLoss(scale=1.0::Float64; domain=RealDomain(), quantile=.5::Float64) = QuantileLoss(scale, domain, quantile)
function evaluate(l::QuantileLoss, u::Real, a::Number)
diff = a-u
diff > 0 ? l.scale * l.quantile * diff : - l.scale * (1-l.quantile) * diff
end
function grad(l::QuantileLoss,u::Real,a::Number)
diff = a-u
diff > 0 ? -l.scale * l.quantile : l.scale * (1-l.quantile)
end
M_estimator(l::QuantileLoss, a::AbstractArray) = quantile(a, l.quantile)
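# Illustrative sketch (not part of the package API): with quantile = 0.9, underestimating a by one
# unit costs 0.9 while overestimating by one unit costs only 0.1, which is why the best single
# estimate of a data vector is its 0.9 quantile. The function name below is hypothetical and safe
# to delete.
function example_quantile_loss()
    l = QuantileLoss(1.0, quantile=0.9)
    under = evaluate(l, 0.0, 1.0)   # a - u = 1 > 0, so the loss is 0.9
    over  = evaluate(l, 1.0, 0.0)   # a - u = -1 < 0, so the loss is 0.1
    return under, over
end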
########################################## PERIODIC ##########################################
# f: ℜxℜ -> ℜ
# f(u,a) = w * (1 - cos((a-u)*(2*pi)/T))
# this measures how far away u and a are on a circle of circumference T.
mutable struct PeriodicLoss<:DiffLoss
T::Float64 # the length of the period
scale::Float64
domain::Domain
end
PeriodicLoss(T, scale=1.0::Float64; domain=PeriodicDomain(T)) = PeriodicLoss(T, scale, domain)
evaluate(l::PeriodicLoss, u::Real, a::Number) = l.scale*(1-cos((a-u)*(2*pi)/l.T))
grad(l::PeriodicLoss, u::Real, a::Number) = -l.scale*((2*pi)/l.T)*sin((a-u)*(2*pi)/l.T)
function M_estimator(l::PeriodicLoss, a::AbstractArray{<:Real})
(l.T/(2*pi))*atan( sum(sin.(2*pi*a/l.T)) / sum(cos.(2*pi*a/l.T)) ) + l.T/2 # not kidding.
# this is the estimator, and there is a form that works with weighted measurements (aka a prior on a)
# see: http://www.tandfonline.com/doi/pdf/10.1080/17442507308833101 eq. 5.2
end
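# Illustrative sketch (not part of the package API): values that differ by a whole period incur
# (essentially) zero loss, while values half a period apart incur the maximum loss of 2*scale.
# The function name below is hypothetical and safe to delete.
function example_periodic_loss()
    l = PeriodicLoss(24.0)            # e.g. hour-of-day data
    same  = evaluate(l, 1.0, 25.0)    # 1 - cos(2π) ≈ 0.0
    worst = evaluate(l, 1.0, 13.0)    # 1 - cos(π)  ≈ 2.0
    return same, worst
end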
########################################## POISSON ##########################################
# f: ℜxℕ -> ℜ
# BEWARE:
# 1) this is a reparametrized Poisson: we parametrize the mean as exp(u) so that u can take any real value and still produce a positive mean
# 2) THIS LOSS MAY CAUSE MODEL INSTABILITY AND DIFFICULTY FITTING.
mutable struct PoissonLoss<:Loss
scale::Float64
domain::Domain
end
PoissonLoss(max_count=2^31::Int; domain=CountDomain(max_count)::Domain) = PoissonLoss(1.0, domain)
function evaluate(l::PoissonLoss, u::Real, a::Number)
l.scale*(exp(u) - a*u + (a==0 ? 0 : a*(log(a)-1))) # log(a!) ~ a==0 ? 0 : a*(log(a)-1)
end
grad(l::PoissonLoss, u::Real, a::Number) = l.scale*(exp(u) - a)
M_estimator(l::PoissonLoss, a::AbstractArray) = log(mean(a))
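# Illustrative sketch (not part of the package API): because the mean is parametrized as exp(u),
# the gradient scale*(exp(u) - a) vanishes exactly when exp(u) equals the observed count, i.e. at
# u = log(a), consistent with M_estimator above. The function name below is hypothetical and safe
# to delete.
function example_poisson_loss()
    l = PoissonLoss()
    a = 5
    u = log(a)               # exp(u) == 5, the observed count
    return grad(l, u, a)     # ≈ 0.0
end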
########################################## ORDINAL HINGE ##########################################
# f: ℜx{min, min+1... max-1, max} -> ℜ
mutable struct OrdinalHingeLoss<:Loss
min::Integer
max::Integer
scale::Float64
domain::Domain
end
OrdinalHingeLoss(m1, m2, scale=1.0::Float64; domain=OrdinalDomain(m1,m2)) = OrdinalHingeLoss(m1,m2,scale,domain)
# this method should never be called directly but is needed to support copying
OrdinalHingeLoss() = OrdinalHingeLoss(1, 10, 1.0, OrdinalDomain(1,10))
OrdinalHingeLoss(m2) = OrdinalHingeLoss(1, m2, 1.0, OrdinalDomain(1, m2))
function evaluate(l::OrdinalHingeLoss, u::Real, a::Number)
#a = round(a)
if u > l.max-1
# number of levels higher than true level
n = min(floor(u), l.max-1) - a
loss = n*(n+1)/2 + (n+1)*(u-l.max+1)
elseif u > a
# number of levels higher than true level
n = min(floor(u), l.max) - a
loss = n*(n+1)/2 + (n+1)*(u-floor(u))
elseif u > l.min+1
# number of levels lower than true level
n = a - max(ceil(u), l.min+1)
loss = n*(n+1)/2 + (n+1)*(ceil(u)-u)
else
# number of levels lower than true level
n = a - max(ceil(u), l.min+1)
loss = n*(n+1)/2 + (n+1)*(l.min+1-u)
end
return l.scale*loss
end
function grad(l::OrdinalHingeLoss, u::Real, a::Number)
#a = round(a)
if u > a
# number of levels higher than true level
n = min(ceil(u), l.max) - a
g = n
else
# number of levels lower than true level
n = a - max(floor(u), l.min)
g = -n
end
return l.scale*g
end
M_estimator(l::OrdinalHingeLoss, a::AbstractArray) = median(a)
########################################## LOGISTIC ##########################################
# f: ℜx{-1,1}-> ℜ
mutable struct LogisticLoss<:ClassificationLoss
scale::Float64
domain::Domain
end
LogisticLoss(scale=1.0::Float64; domain=BoolDomain()) = LogisticLoss(scale, domain)
evaluate(l::LogisticLoss, u::Real, a::Bool) = l.scale*log(1+exp(-(2a-1)*u))
grad(l::LogisticLoss, u::Real, a::Bool) = (aa = 2a-1; -aa*l.scale/(1+exp(aa*u)))
function M_estimator(l::LogisticLoss, a::AbstractArray{Bool,1})
d, N = sum(a), length(a)
log(d) - log(N - d) # the log odds of a true observation
end
########################################## WEIGHTED HINGE ##########################################
# f: ℜx{-1,1} -> ℜ
# f(u,a) = w * max(1-a*u, 0)       for a = -1 (false)
#        = c * w * max(1-a*u, 0)   for a = +1 (true), where c = case_weight_ratio
mutable struct WeightedHingeLoss<:ClassificationLoss
scale::Float64
domain::Domain
case_weight_ratio::Float64 # >1 for trues to have more confidence than falses, <1 for opposite
end
WeightedHingeLoss(scale=1.0; domain=BoolDomain(), case_weight_ratio=1.0) =
WeightedHingeLoss(scale, domain, case_weight_ratio)
HingeLoss(scale=1.0::Float64; kwargs...) = WeightedHingeLoss(scale; kwargs...) # the standard HingeLoss is a special case of WeightedHingeLoss
function evaluate(l::WeightedHingeLoss, u::Real, a::Bool)
loss = l.scale*max(1-(2*a-1)*u, 0)
if l.case_weight_ratio !==1. && a
loss *= l.case_weight_ratio
end
return loss
end
function grad(l::WeightedHingeLoss, u::Real, a::Bool)
an = (2*a-1) # change to {-1,1}
g = (an*u>=1 ? 0 : -an*l.scale)
if l.case_weight_ratio !==1. && a
g *= l.case_weight_ratio
end
return g
end
function M_estimator(l::WeightedHingeLoss, a::AbstractArray{Bool,1})
r = length(a)/length(filter(x->x>0, a)) - 1
if l.case_weight_ratio > r
m = 1.0
elseif l.case_weight_ratio == r
m = 0.0
else
m = -1.0
end
end
########################################## MULTINOMIAL ##########################################
# f: ℜx{1, 2, ..., max-1, max} -> ℜ
# f computes the negative log likelihood of the multinomial logit (softmax) model,
# whose likelihood of observing level a given the parameter vector u is
# p(u, a) = exp(u[a]) / (sum_{a'} exp(u[a']))
#         = 1 / (sum_{a'} exp(u[a'] - u[a]))
# so f(u, a) = -log p(u, a)
mutable struct MultinomialLoss<:Loss
max::Integer
scale::Float64
domain::Domain
end
MultinomialLoss(m, scale=1.0::Float64; domain=CategoricalDomain(m)) = MultinomialLoss(m,scale,domain)
embedding_dim(l::MultinomialLoss) = l.max
datalevels(l::MultinomialLoss) = 1:l.max # levels are encoded as the numbers 1:l.max
function evaluate(l::MultinomialLoss, u::Array{<:Real,1}, a::Integer)
sumexp = 0 # inverse likelihood of observation
# computing soft max directly is numerically unstable
# instead note logsumexp(a_j) = logsumexp(a_j - M) + M
# and we'll pick a good big (but not too big) M
M = maximum(u) - u[a] # prevents overflow
for j in 1:length(u)
sumexp += exp(u[j] - u[a] - M)
end
loss = log(sumexp) + M
return l.scale*loss
end
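# Illustrative sketch (not part of the package API) of the log-sum-exp shift used above: even with
# a huge entry in u the loss does not overflow, and predicting the observed level with a large
# margin gives a loss near zero. The function name below is hypothetical and safe to delete.
function example_multinomial_loss()
    l = MultinomialLoss(3)
    u = [1000.0, 0.0, 0.0]
    # a naive exp.(u) would overflow; the shifted computation returns ≈ log(1 + 2*exp(-1000)) ≈ 0
    return evaluate(l, u, 1)
end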
function grad(l::MultinomialLoss, u::Array{<:Real,1}, a::Integer)
g = zeros(size(u))
# the gradient of the negative log likelihood is softmax(u) minus the indicator of the observed level:
g[a] = -1
# and below we add softmax(u)[j] = 1 / sum_{a'} exp(u[a'] - u[j]) to every entry j, so
# the contribution of one observation to one entry of the gradient
# is always between -1 and 1
for j in 1:length(u)
M = maximum(u) - u[j] # prevents overflow
sumexp = 0
for jp in 1:length(u)
sumexp += exp(u[jp] - u[j] - M)
end
g[j] += exp(-M)/sumexp
end
return l.scale*g
end
## we'll compute it via a stochastic gradient method
## with fixed step size
function M_estimator(l::MultinomialLoss, a::AbstractArray)
u = zeros(l.max)
for i = 1:length(a)
ai = a[i]
u -= .1*grad(l, u, ai)
end
return u
end
########################################## One vs All loss ##########################################
# f: ℜx{1, 2, ..., max-1, max} -> ℜ
mutable struct OvALoss<:Loss
max::Integer
bin_loss::Loss
scale::Float64
domain::Domain
end
OvALoss(m::Integer, scale::Float64=1.0; domain=CategoricalDomain(m), bin_loss::Loss=LogisticLoss(scale)) = OvALoss(m,bin_loss,scale,domain)
OvALoss() = OvALoss(1) # for copying correctly
embedding_dim(l::OvALoss) = l.max
datalevels(l::OvALoss) = 1:l.max # levels are encoded as the numbers 1:l.max
function evaluate(l::OvALoss, u::Array{<:Real,1}, a::Integer)
loss = 0
for j in 1:length(u)
loss += evaluate(l.bin_loss, u[j], a==j)
end
return l.scale*loss
end
function grad(l::OvALoss, u::Array{<:Real,1}, a::Integer)
g = zeros(length(u))
for j in 1:length(u)
g[j] = grad(l.bin_loss, u[j], a==j)
end
return l.scale*g
end
function M_estimator(l::OvALoss, a::AbstractArray)
u = zeros(l.max)
for j = 1:l.max
u[j] = M_estimator(l.bin_loss, a .== j)
end
return u
end
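# Illustrative sketch (not part of the package API): the one-vs-all loss on a categorical
# observation is the sum of binary losses, with level j treated as "true" exactly when a == j.
# The function name below is hypothetical and safe to delete.
function example_ova_loss()
    l = OvALoss(3)
    u = [2.0, -1.0, -1.0]
    by_hand = sum(evaluate(LogisticLoss(), u[j], 1 == j) for j in 1:3)
    return evaluate(l, u, 1), by_hand   # the two values are equal
end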
########################################## Bigger vs Smaller loss ##########################################
# f: ℜx{1, 2, ..., max-1} -> ℜ
mutable struct BvSLoss<:Loss
max::Integer
bin_loss::Loss
scale::Float64
domain::Domain
end
BvSLoss(m::Integer, scale::Float64=1.0; domain=OrdinalDomain(1,m), bin_loss::Loss=LogisticLoss(scale)) = BvSLoss(m,bin_loss,scale,domain)
BvSLoss() = BvSLoss(10) # for copying correctly
embedding_dim(l::BvSLoss) = l.max-1
datalevels(l::BvSLoss) = 1:l.max # levels are encoded as the numbers 1:l.max
function evaluate(l::BvSLoss, u::Array{<:Real,1}, a::Integer)
loss = 0
for j in 1:length(u)
loss += evaluate(l.bin_loss, u[j], a>j)
end
return l.scale*loss
end
function grad(l::BvSLoss, u::Array{<:Real,1}, a::Integer)
g = zeros(length(u))
for j in 1:length(u)
g[j] = grad(l.bin_loss, u[j], a>j)
end
return l.scale*g
end
function M_estimator(l::BvSLoss, a::AbstractArray)
u = zeros(l.max-1)
for j = 1:l.max-1
u[j] = M_estimator(l.bin_loss, a.>j)
end
return u
end
########################################## ORDERED LOGISTIC ##########################################
# f: ℜx{1, 2, ..., max-1, max} -> ℜ
# f computes the negative log likelihood of the ordistic model, a softmax on -u.^2:
# p(u, a) = exp(-u[a]^2) / (sum_{a'} exp(-u[a']^2))
# so f(u, a) = -log p(u, a)
mutable struct OrdisticLoss<:Loss
max::Integer
scale::Float64
domain::Domain
end
OrdisticLoss(m::Int, scale=1.0::Float64; domain=OrdinalDomain(1,m)) = OrdisticLoss(m,scale,domain)
embedding_dim(l::OrdisticLoss) = l.max
datalevels(l::OrdisticLoss) = 1:l.max # levels are encoded as the numbers 1:l.max
function evaluate(l::OrdisticLoss, u::Array{<:Real,1}, a::Integer)
diffusquared = u[a]^2 .- u.^2
M = maximum(diffusquared)
invlik = sum(exp, (diffusquared .- M))
loss = M + log(invlik)
return l.scale*loss
end
function grad(l::OrdisticLoss, u::Array{<:Real,1}, a::Integer)
g = zeros(size(u))
# the observed level contributes the derivative of u[a]^2
g[a] = 2*u[a]
for j in 1:length(u)
diffusquared = u[j]^2 .- u.^2
M = maximum(diffusquared)
invlik = sum(exp,(diffusquared .- M))
g[j] -= 2 * u[j] * exp(- M) / invlik
end
return l.scale*g
end
## we'll compute it via a stochastic gradient method
## with fixed step size
function M_estimator(l::OrdisticLoss, a::AbstractArray)
u = zeros(l.max)
for i = 1:length(a)
ai = a[i]
u -= .1*grad(l, u, ai)
end
return u
end
#################### Multinomial Ordinal Logit #####################
# l: ℜ^{max-1} x {1, 2, ..., max-1, max} -> ℜ
# l computes the (negative log likelihood of the) multinomial ordinal logit.
#
# the length of the first argument u is one less than
# the number of levels of the second argument a,
# since the entries of u correspond to the division between each level
# and the one above it.
#
# XXX warning XXX
# the documentation in the comment below this point is defunct
#
# To yield a sensible pdf, the entries of u should be increasing
# (b/c they're basically the -log of the cdf at the boundary between each level)
#
# The multinomial ordinal logit corresponds to a likelihood p with
# p(u, a > i) ~ exp(-u[i]), so
# p(u, a) ~ exp(-u[1]) * ... * exp(-u[a-1]) * exp(u[a]) * ... * exp(u[end])
# = exp(- u[1] - ... - u[a-1] + u[a] + ... + u[end])
# and normalizing,
# p(u, a) = p(u, a) / sum_{a'} p(u, a')
#
# So l(u, a) = -log(p(u, a))
# = u[1] + ... + u[a-1] - u[a] - ... - u[end] +
# log(sum_{a'}(exp(u[1] + ... + u[a'-1] - u[a'] - ... - u[end])))
#
# Inspection of this loss function confirms that given u,
# the most probable value a is the index of the first
# positive entry of u
mutable struct MultinomialOrdinalLoss<:Loss
max::Integer
scale::Float64
domain::Domain
end
MultinomialOrdinalLoss(m::Int, scale=1.0::Float64; domain=OrdinalDomain(1,m)) = MultinomialOrdinalLoss(m,scale,domain)
MultinomialOrdinalLoss() = MultinomialOrdinalLoss(10) # for copying
embedding_dim(l::MultinomialOrdinalLoss) = l.max - 1
datalevels(l::MultinomialOrdinalLoss) = 1:l.max # levels are encoded as the numbers 1:l.max
function enforce_MNLOrdRules!(u; TOL=1e-3)
u[1] = min(-TOL, u[1])
for j=2:length(u)
u[j] = min(u[j], u[j-1]-TOL)
end
u
end
# the argument u is a vector holding one row slice of the parameter matrix
# todo: increase numerical stability
function evaluate(l::MultinomialOrdinalLoss, u::Array{<:Real,1}, a::Integer)
enforce_MNLOrdRules!(u)
if a == 1
return -l.scale*log(exp(0) - exp(u[1])) # (log(1 - exp(u[a] - 1)))
elseif a == l.max
return -l.scale*u[a-1]
else
return -l.scale*log(exp(u[a-1]) - exp(u[a])) # (u[a-1] + log(1 - exp(u[a] - u[a-1])))
end
end
function grad(l::MultinomialOrdinalLoss, u::Array{<:Real,1}, a::Integer)
enforce_MNLOrdRules!(u)
g = zeros(size(u))
if a == 1
g[1] = -exp(u[1])/(exp(0) - exp(u[1]))
# g[1] = 1/(1 - exp(-u[1]))
elseif a == l.max
g[a-1] = 1
else
# d = exp(u[a] - u[a-1])
# g[a] = d/(1-d)
# g[a-1] = - g[a] - 1
g[a] = -exp(u[a])/(exp(u[a-1]) - exp(u[a]))
g[a-1] = exp(u[a-1])/(exp(u[a-1]) - exp(u[a]))
end
return -l.scale*g
end
## we'll compute it via a stochastic gradient method
## with fixed step size
## (we don't need a hyper accurate estimate for this)
function M_estimator(l::MultinomialOrdinalLoss, a::AbstractVector)
u = zeros(l.max-1)
for i = 1:length(a)
ai = a[i]
u -= .1*grad(l, u, ai)
end
return u
end
### convenience methods for evaluating and computing gradients on vectorized arguments
function evaluate(l::Loss, u::Array{<:Real,1}, a::AbstractVector)
@assert size(u) == size(a)
out = 0
for i=1:length(a)
out += evaluate(l, u[i], a[i])
end
return out
end
#Optimized vector evaluate on single-dimensional losses
function evaluate(l::SingleDimLoss, u::Vector{<:Real}, a::AbstractVector)
losseval = (x::Real, y::Number) -> evaluate(l, x, y)
mapped = fill!(similar(u),0.)
map!(losseval, mapped, u, a)
reduce(+, mapped)
end
# now for multidimensional losses
function evaluate(l::Loss, u::Array{<:Real,2}, a::AbstractVector)
@assert size(u,1) == length(a)
out = 0
for i=1:length(a)
out += evaluate(l, u[i,:], a[i])
end
return out
end
function grad(l::Loss, u::Array{<:Real,1}, a::AbstractVector)
@assert size(u) == size(a)
mygrad = zeros(size(u))
for i=1:length(a)
mygrad[i] = grad(l, u[i], a[i])
end
return mygrad
end
# Optimized vector grad on single-dimensional losses
function grad(l::SingleDimLoss, u::Vector{<:Real}, a::AbstractVector)
lossgrad = (x::Real,y::Number) -> grad(l, x, y)
mapped = fill!(similar(u),0.)
map!(lossgrad, mapped, u, a)
end
# now for multidimensional losses
function grad(l::Loss, u::Array{<:Real,2}, a::AbstractVector)
@assert size(u,1) == length(a)
mygrad = zeros(size(u))
for i=1:length(a)
mygrad[i,:] = grad(l, u[i,:], a[i])
end
return mygrad
end
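# Illustrative sketch (not part of the package API): for single-dimensional losses u and a are both
# plain vectors, while for multidimensional losses u is a matrix with one row (of width
# embedding_dim) per observation in a. The function name below is hypothetical and safe to delete.
function example_vectorized_evaluate()
    # single-dimensional: total loss over three observations
    single = evaluate(QuadLoss(), [1.0, 2.0, 3.0], [1.0, 0.0, 3.0])   # (2-0)^2 = 4.0
    # multidimensional: two observations of a 3-level categorical, one row of u per observation
    u = [1.0 0.0 0.0;
         0.0 2.0 0.0]
    multi = evaluate(MultinomialLoss(3), u, [1, 2])
    return single, multi
end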