\documentclass[11pt]{article}
\usepackage{monashwp}
\usepackage{tikz,amsthm,booktabs,pmat,tabularx,threeparttable,algpseudocode,algorithm}
\usepackage{graphicx,url,paralist,mathtools,float,setspace,rotating,booktabs,longtable,caption,subcaption,pdflscape,threeparttable,enumerate,xpatch,tabularx}
%\mathtoolsset{showonlyrefs}
\usepackage[inline]{enumitem}
\usepackage[section]{placeins}
\bibliography{WickEtAl2018}
\bibliography{package}
\DeclareNameAlias{sortname}{last-first}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newcommand{\E}{\textnormal{E}}
\newcommand{\var}{\textnormal{var}}
\newcommand{\by}{\bm{y}}
\newcommand{\be}{\bm{e}}
\newcommand{\ba}{\bm{a}}
\newcommand{\bS}{\bm{S}}
\newcommand{\bC}{\bm{C}}
\newcommand{\bP}{\bm{P}}
\newcommand{\bW}{\bm{W}}
\newcommand{\bV}{\bm{V}}
\newcommand{\bOmega}{\bm{\Omega}}
\newcommand{\bI}{\bm{I}}
\newcommand{\bb}{\bm{b}}
\newcommand{\bB}{\bm{B}}
\newcommand{\bbeta}{\bm{\beta}}
\newcommand{\bSigma}{\bm{\Sigma}}
\newcommand{\bGamma}{\bm{\Gamma}}
\newcommand{\bepsilon}{\bm{\varepsilon}}
\newcommand{\info}{\mathcal{I}_t}
\newcommand{\Normal}{\mathcal{N}}
\newcommand{\0}{\phantom{0}}
\newtheorem{definition}{Definition}[section]
\newcommand*{\minidx}{\operatornamewithlimits{min}\limits}
\newcommand*{\maxidx}{\operatornamewithlimits{max}\limits}
% Defined to be used in algorithms
\makeatletter
\algrenewcommand\ALG@beginalgorithmic{\normalsize}
\renewcommand{\algorithmicrequire}{\scshape Initialization:}
\algrenewcommand\algorithmicforall[1]{{\scshape for} #1}
\algrenewcommand\algorithmicdo{{\scshape do}}
\algrenewcommand\algorithmicend[1]{{\scshape end}}
\algrenewcommand\algorithmicfor{{\scshape for}}
\algrenewcommand\algorithmicif{{\scshape if}}
\algrenewcommand\algorithmicthen{{\scshape then}}
\algrenewcommand\algorithmicelse[1]{{\scshape else}}
\newcommand{\myifthen}[1]{{\hspace{1.5cm}\algorithmicif} #1 \algorithmicthen}
\newcommand{\myifend}{\hspace{1.5cm}\algorithmicend \algorithmicindent {\scshape if}}
\newcommand{\myelse}{\hspace{1.5cm}\algorithmicelse}
\algrenewcommand\algorithmicrepeat{{\scshape repeat}}
\algrenewcommand\algorithmicuntil{{\scshape until}}
\algnewcommand{\And}{and}
\makeatother
\renewcommand{\thealgorithm}{\arabic{section}.\arabic{algorithm}}
% Shanika's comments
\usepackage[colorinlistoftodos,prependcaption,textsize=tiny]{todonotes}
\usepackage{xargs}
\newcommandx{\shani}[2][1=]{\todo[linecolor=blue,backgroundcolor=blue!25,bordercolor=blue,#1]{#2}}
\newcommandx{\rob}[2][1=]{\todo[linecolor=red,backgroundcolor=red!25,bordercolor=red,#1]{#2}}
\makeatletter
\newcommand{\multiline}[1]{%
\begin{tabularx}{\dimexpr\linewidth-\ALG@thistlm}[t]{@{}X@{}}
#1
\end{tabularx}
}
\makeatother
\title{Optimal non-negative forecast~reconciliation}
\author{Shanika L Wickramasuriya\\ Berwin A Turlach\\ Rob J Hyndman}
%\nocover
\nojel
\wp{15/19}
\def\blindtitle{}
\addresses{\textbf{Shanika L Wickramasuriya}\\
Department of Statistics,\\
University of Auckland,\\ Auckland, New Zealand.\\
Email: [email protected]\\
Corresponding author\\[1cm]
\textbf{Berwin A Turlach}\\
Centre for Applied Statistics,\\
The University of Western Australia,\\ Crawley, Australia.\\
Email: [email protected]\\[1cm]
\textbf{Rob J Hyndman}\\
Department of Econometrics and Business Statistics,\\
Monash University,\\ Melbourne, Australia.\\
Email: [email protected]\\[1cm]}
\lfoot{\sf Wickramasuriya, Turlach \& Hyndman: \today}
\begin{document}
\titlepage
\begin{abstract}
{\color{red}
The sum of forecasts of disaggregated time series is often required to equal the forecast of the aggregate, giving a set of coherent forecasts. The least squares solution for finding coherent forecasts uses a reconciliation approach known as MinT, proposed by \citet{Wick2018}. The MinT approach and its variants do not guarantee that the coherent forecasts are non-negative, even when all of the original forecasts are non-negative. This is a serious issue in applications that are inherently non-negative, such as sales data or tourism numbers. To overcome this difficulty, we reconsider the least squares minimization problem with non-negativity constraints, to ensure that the coherent forecasts are non-negative.
The constrained quadratic programming problem is solved using three algorithms: the block principal pivoting (BPV) algorithm, the projected conjugate gradient (PCG) algorithm, and the scaled gradient projection (SGP) algorithm. A Monte Carlo simulation is performed to evaluate the computational performance of these algorithms as the number of time series increases. The results demonstrate that the BPV algorithm clearly outperforms the others, with PCG second best. The superior performance of the BPV algorithm can be partially attributed to the alternative representation of the weight matrix in the MinT approach.
An empirical investigation is carried out to assess the impact of imposing non-negativity constraints on forecast reconciliation, relative to the unconstrained method. Slight gains in forecast accuracy are observed at the most disaggregated level, and slight losses at the aggregated levels. Although these gains and losses are negligible, the procedure plays an important role in decision and policy implementation processes.
}
\end{abstract}
\begin{keywords}
Aggregation, Australian tourism, Coherent forecasts, Contemporaneous error correlation, Forecast combinations, Least squares, Non-negative, Spatial correlations, Reconciliation.
\end{keywords}
\newpage
\section{Introduction}
Forecast reconciliation is the problem of ensuring that disaggregated forecasts add up to the corresponding forecasts of the aggregated time series. This is a common problem in manufacturing, for example, where time series of sales are disaggregated in several ways --- by region, product-type, and so on. There are often tens of thousands of forecasts at the most disaggregated level, and these are required to sum to give forecasts at higher levels of aggregation --- a property known as ``coherence''.
A simple solution would be to forecast the most disaggregated series and sum the results. However, this tends to give poor aggregate forecasts due to the low signal-to-noise ratio in the disaggregated time series. Instead, a better solution is to forecast all the series at all levels of aggregation, and then reconcile them so they are coherent; that is, so that the forecasts of the disaggregated series add up to the forecasts of the aggregated series. Least squares reconciliation was proposed by \citet{Hyndman2011}, whereby the reconciled forecasts are as close as possible (in the $L_2$ sense) to the original (``base'') forecasts subject to the aggregation constraint. This result was extended by \citet{Hyndman2016} to a larger class of reconciliation problems, and by \citet{Wick2018} who showed that the resulting reconciled forecasts are minimum variance unbiased estimators.
{\color{red}
Forecast reconciliation provides a simple approach to multivariate forecasting, especially when there are very large numbers of time series. It reduces the problem to a collection of univariate forecasting problems, followed by a reconciliation step which captures the relationships between the series by estimating the covariance matrix of the univariate forecast errors. There are many multivariate time series models available that one might expect to perform better than univariate models. However, in practice univariate forecasting often outperforms multivariate forecasting, because of the large number of unknown parameters that must be estimated for multivariate models, especially for high-dimensional time series \citep{Chatfield2000}. The estimation can also be prohibitive if there is an insufficient number of observations. Even with a reasonable number of observations, the computing algorithms can be extremely slow. Some multivariate modelling approaches have been developed to avoid these problems; for example, factor models \citep[see][]{Bai2003}. However, these are still more complicated to use: stationarity and cointegration issues must be considered, and finding the optimal number of factors can be difficult. One of the main reasons for using forecast reconciliation is to avoid multivariate forecasting.
}
Most of the applications that we have found are inherently non-negative in nature, where the time series take only non-negative values such as revenue, demand, and counts of people. In such circumstances, it is important to ensure that the reconciled forecasts are non-negative, as forecasters and practitioners need to be able to make meaningful managerial decisions. Unfortunately, the MinT approach proposed by \citet{Wick2018} and its variants fail to guarantee this property even when the base forecasts are non-negative.
This can be overcome by explicitly imposing the non-negativity constraints on the reconciliation procedure. One simple technique is to overwrite any negatives in the reconciled forecasts with zeros. However, the resulting approximate solution has ill-defined mathematical properties. This type of overwriting approximation is used in alternating least squares estimation \citep{Berry2007, Karjalainen1991}. The problem with this method is that it does not necessarily lower the objective function in each iteration, so convergence of the solution to the least squares minimum is not guaranteed, and {\color{red} the resulting forecasts can also be incoherent}. There are algorithms available that solve these problems in a mathematically rigorous way; we discuss how these can be applied to the forecast reconciliation problem.
{\color{red} The purpose of non-negative forecast reconciliation is not necessarily to improve forecast accuracy but to align the forecasts with their use in practice. It is important to users that forecasts of inherently non-negative variables (such as sales volume or expenditure) are also non-negative. In cases where forecasts are provided at several levels of aggregation, it is also important that the forecasts are coherent. If forcing non-negative coherence on the forecasts improves accuracy, it is a side benefit, not the main purpose.}
% Section~\ref{sec:qpprob} demonstrates the minimization problem to be considered. Section~\ref{sec:quadalgo} provides a detailed discussion of several algorithms that are applicable in this context, after which their efficiency is evaluated on a set of Monte Carlo simulations and an empirical application in Sections~\ref{sec:MCNN} and~\ref{sec:AUSNN} respectively.
\section{The optimization problem}
\subsection{Notation and MinT reconciliation}
We follow the notation of \citet{Wick2018} and let $\bm{y}_t \in \mathbb{R}^m$ denote a vector of observations at time $t$ comprising all series of interest, including both disaggregated and aggregated time series. \textcolor{red}{We place all the aggregated series at the top of $\bm{y}_t$ and the disaggregated series at the bottom.} We also define $\bm{b}_t \in \mathbb{R}^n$ to be the vector of the most disaggregated series at time~$t$. These two vectors are connected via $\bm{y}_t=\bm{S}\bm{b}_t$, where $\bm{S}$ is the ``summing'' matrix of order $m \times n$ showing how the various aggregated time series in $\bm{y}_t$ are constructed from the disaggregated series in $\bm{b}_t$. {\color{red} Specifically, the entries of the $\bm{S}$ matrix consist of 1s and 0s: its $ij$th element is equal to one if the $i$th series includes the $j$th most disaggregated series, and zero otherwise, for $i = 1, 2, \dots, m$ and $j = 1, 2, \dots, n$.}
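To make the structure of $\bm{S}$ concrete, the following R sketch constructs the summing matrix for a small two-level hierarchy in which a total series splits into two groups, each with two bottom-level series; the series values are purely illustrative.
\begin{verbatim}
# Summing matrix for a small hierarchy: Total = A + B,
# A = AA + AB, B = BA + BB (n = 4 bottom series, m = 7 in total).
C <- rbind(
  c(1, 1, 1, 1),          # Total
  c(1, 1, 0, 0),          # A
  c(0, 0, 1, 1)           # B
)
S <- rbind(C, diag(4))    # aggregation rows stacked on an identity block
b <- c(10, 20, 30, 40)    # illustrative bottom-level values at time t
y <- as.vector(S %*% b)   # all m series, aggregates at the top
\end{verbatim}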
Let $\hat{\bm{y}}_T(h)$ be a vector of original (base) $h$-step-ahead forecasts, made at time~$T$, stacked in the same order as $\bm{y}_t$. These will not generally be coherent. The least squares reconciled forecasts are given by
\begin{align*}
\tilde{\bm{y}}_{T}(h) = \bm{S}\tilde{\bm{b}}_{T}(h),
\end{align*}
where
\begin{align}
\tilde{\bm{b}}_{T}(h) = \left[(\bm{S}^\top\bm{\Lambda}_{h}^{-1}\bm{S})^{-1}\bm{S}^\top\bm{\Lambda}_{h}^{-1}\right]\hat{\bm{y}}_{T}(h),
\label{eq:glsform}
\end{align}
and $\bm{\Lambda}_h$ is a {\color{red} positive definite} weighting matrix. \citet{Wick2018} showed that setting $\bm{\Lambda}_h =\var[\bm{y}_{t+h} - \hat{\bm{y}}_{t}(h)\mid\bm{\mathcal{I}}_{t}]$ where $\bm{\mathcal{I}}_t = \{\bm{y}_{t}, \bm{y}_{t-1}, \dots \}$ to be the covariance matrix of the $h$-step-ahead base forecast errors minimizes the trace of
$\var[\bm{y}_{t+h} - \tilde{\bm{y}}_{t}(h)\mid \bm{\mathcal{I}}_{t}]$ amongst all possible unbiased reconciliations. {\color{red} Unless simple forecasting methods such as (seasonal) naive or linear trend models are used, it is unlikely that the covariance matrix of the base forecast errors is singular (i.e., positive semi-definite but not positive definite).} They also derived an alternative expression for $\tilde{\bm{b}}_{T}(h)$:
\begin{align}
\tilde{\bm{b}}_{T}(h) = \left[\bm{J} - \bm{J}\bm{\Lambda}_h\bm{U}(\bm{U}^\top\bm{\Lambda}_h\bm{U})^{-1}\bm{U}^\top\right]\hat{\bm{y}}_{T}(h),
\label{eq:altrform}
\end{align}
where $\bm{J} = \begin{pmat}[|] \bm{0}_{n \times (m-n)} & \bI_{n}\cr\end{pmat}$,\quad
$\bm{U}^\top = \begin{pmat}[|]\bI_{m-n} & -\bC_{(m-n) \times n}\cr\end{pmat}$,
and $\bS = \begin{pmat}[{}]
\bC_{(m-n) \times n} \cr\-
\bI_{n} \cr
\end{pmat}$. The use of Eq.~\eqref{eq:altrform} is computationally less demanding, especially for high-dimensional hierarchical time series: it needs only one matrix inversion, of order $(m - n) \times (m - n)$, whereas Eq.~\eqref{eq:glsform} needs two matrix inversions, of orders $m \times m$ and $n \times n$. Typically, $m - n < n$ in applications.
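The equivalence of Eqs.~\eqref{eq:glsform} and~\eqref{eq:altrform} can be checked numerically; the following R sketch does so for a toy hierarchy, with an invented diagonal weighting matrix and invented base forecasts.
\begin{verbatim}
set.seed(1)
C <- rbind(c(1, 1, 1, 1), c(1, 1, 0, 0), c(0, 0, 1, 1))
n <- ncol(C); m <- nrow(C) + n
S <- rbind(C, diag(n))
Lambda <- diag(runif(m, 0.5, 2))   # illustrative positive definite weights
yhat <- runif(m, 1, 10)            # incoherent base forecasts
Li <- solve(Lambda)

# GLS form: inversions of order m and n
b1 <- solve(t(S) %*% Li %*% S, t(S) %*% Li %*% yhat)

# Alternative form: one inversion of order (m - n)
J <- cbind(matrix(0, n, m - n), diag(n))
U <- rbind(diag(m - n), -t(C))     # so that U' = [I | -C]
b2 <- (J - J %*% Lambda %*% U %*%
         solve(t(U) %*% Lambda %*% U, t(U))) %*% yhat
all.equal(as.vector(b1), as.vector(b2))   # TRUE
\end{verbatim}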
{\color{red} Forecast reconciliation has some similarities with forecast combinations, but with the following differences:
\begin{enumerate*}[label=(\roman*)]
\item the combination weights are partially determined by the aggregation structure of the time series;
\item we only produce one forecast for each series;
\item the forecast of each series is a combination of the forecasts of all series;
\item the forecast weights are not restricted to add to 1 or to be non-negative.
\end{enumerate*}
Typically, in forecast combination problems, the combination weights are free to be selected or estimated by the user, there are multiple forecasts for each series, the final forecast of each series is a combination only of forecasts for that series, and the forecast weights are normally restricted to be non-negative and to sum to one.}
\subsection{A quadratic programming solution}
To ensure that all entries in $\tilde{\bm{y}}_{T}(h)$ are non-negative, it is sufficient to guarantee that all entries in $\tilde{\bm{b}}_{T}(h)$ are non-negative. Even though $\tilde{\bm{b}}_{T}(h)$ is derived by minimizing the variances of the reconciled forecast errors across the entire structure, it is also apparent from Eq.~\eqref{eq:glsform} that $\tilde{\bm{b}}_{T}(h)$ is the generalized least squares solution to the following regression problem:
\begin{align*}
\operatornamewithlimits{min}_{\mathring{\bm{b}}} \frac{1}{2}[\hat{\bm{y}}_{T}(h) - \bm{S}\mathring{\bm{b}}]^\top\bm{\Lambda}^{-1}_{h}[\hat{\bm{y}}_{T}(h) - \bm{S}\mathring{\bm{b}}] = \operatornamewithlimits{min}_{\mathring{\bm{b}}} \frac{1}{2}\mathring{\bm{b}}^\top\bm{S}^\top\bm{\Lambda}^{-1}_{h}\bm{S}\mathring{\bm{b}} - \mathring{\bm{b}}^\top\bm{S}^\top\bm{\Lambda}^{-1}_{h}\hat{\bm{y}}_{T}(h) + \frac{1}{2}\hat{\bm{y}}_{T}(h)^\top \bm{\Lambda}_{h}^{-1}\hat{\bm{y}}_{T}(h).
\end{align*}
This suggests that the non-negativity issue can be handled by solving the following quadratic programming problem:
\begin{align}
\label{eq:quadprog}
\operatornamewithlimits{min}_{\mathring{\bm{b}}} q(\mathring{\bm{b}}) \coloneqq \operatornamewithlimits{min}_{\mathring{\bm{b}}} \frac{1}{2}\mathring{\bm{b}}^\top\bm{S}^\top\bm{\Lambda}^{-1}_{h}\bm{S}\mathring{\bm{b}} & - \mathring{\bm{b}}^\top\bm{S}^\top\bm{\Lambda}^{-1}_{h}\hat{\bm{y}}_{T}(h) \nonumber \\
\text{s.t.} \hspace*{2.5mm} \mathring{\bm{b}} & \geq \bm{0},
\end{align}
where the inequalities in Eq.~\eqref{eq:quadprog} have to hold component-wise. The final non-negative reconciled forecasts are then
\begin{align*}
\breve{\bm{y}}_{T}(h) = \bm{S}\breve{\bm{b}}_{T}(h),
\end{align*}
where $\breve{\bm{b}}_{T}(h)$ is the solution to the quadratic programming problem in Eq.~\eqref{eq:quadprog}. This estimation problem is also referred to as ``non-negative least squares'' (NNLS).
There are a few important features of this minimization problem that are worth discussing. Consider the following statements:
\begin{quote}
A vector $\mathring{\bm{b}}$ is said to be feasible if it satisfies all of the constraints in the quadratic programming problem in Eq.~\eqref{eq:quadprog}. The feasible region is the set of all feasible vectors $\mathring{\bm{b}}$, and the quadratic programming problem is said to be feasible if the feasible region is non-empty.
\end{quote}
\begin{quote}
If $\bm{S}^\top\bm{\Lambda}_{h}^{-1}\bm{S}$ is a positive definite matrix, i.e., $\bm{x}^\top(\bm{S}^\top\bm{\Lambda}_{h}^{-1}\bm{S})\bm{x} > 0$ for all $\bm{x} \neq \bm{0}$, then the objective function of the minimization problem in Eq.~\eqref{eq:quadprog} is a strictly convex function.
\end{quote}
It is easy to show that $\bm{S}^\top\bm{\Lambda}_{h}^{-1}\bm{S}$ is a positive definite matrix by using the fact that the matrix $\bm{S}$ is of full column rank and assuming that $\bm{\Lambda}_{h}$ is positive definite. These simple non-negativity constraints will also ensure that the feasible region is non-empty. Therefore, the quadratic programming problem in Eq.~\eqref{eq:quadprog} has a unique global solution; i.e., there are no local minima apart from the global minimum \citep{Turl2015}.
Unlike \citet{Wick2018}, we will not impose a constraint of unbiasedness on the reconciled forecasts obtained as the solution of the minimization problem in Eq.~\eqref{eq:quadprog}.
The well-known quadratic programming problem in Eq.~\eqref{eq:quadprog} is easy to solve for small-scale hierarchical or grouped structures using the \texttt{quadprog} package for R, which is designed to handle dense matrices \citep{quadprog2013}. However, we require matrices to be stored in a sparse format, due to the large size of the structures that typically arise in forecast reconciliation.
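For such small-scale structures, problem~\eqref{eq:quadprog} maps directly onto the arguments of \texttt{solve.QP} in the \texttt{quadprog} package; the following sketch illustrates this for the toy hierarchy above, with invented base forecasts and identity weights.
\begin{verbatim}
library(quadprog)   # dense solver; suitable for small hierarchies only

C <- rbind(c(1, 1, 1, 1), c(1, 1, 0, 0), c(0, 0, 1, 1))
n <- ncol(C)
S <- rbind(C, diag(n))
Lambda <- diag(nrow(S))                   # identity weights, for illustration
yhat <- c(100, 60, 35, 40, 25, -5, 30)    # base forecasts with a negative entry
Li <- solve(Lambda)

sol <- solve.QP(
  Dmat = t(S) %*% Li %*% S,               # quadratic term S' Lambda^{-1} S
  dvec = as.vector(t(S) %*% Li %*% yhat), # linear term S' Lambda^{-1} yhat
  Amat = diag(n),                         # constraints t(Amat) %*% b >= bvec,
  bvec = rep(0, n)                        # i.e. b >= 0 component-wise
)
b_nn <- pmax(sol$solution, 0)             # clip tiny negative round-off
y_nn <- as.vector(S %*% b_nn)             # coherent, non-negative forecasts
\end{verbatim}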
\subsection{First-order optimality conditions}
We can derive first-order necessary conditions for $\breve{\bm{b}}_{T}(h)$ to minimize the NNLS problem.
Consider the Lagrangian function for the minimization problem in Eq.~\eqref{eq:quadprog}:
\begin{align*}
\mathcal{L}(\mathring{\bm{b}}, \bm{\lambda}) & = q(\mathring{\bm{b}}) - \bm{\lambda}^\top\mathring{\bm{b}},
\end{align*}
where $q(\mathring{\bm{b}}) = \frac{1}{2}\mathring{\bm{b}}^\top\bm{S}^\top\bm{\Lambda}^{-1}_{h}\bm{S}\mathring{\bm{b}} - \mathring{\bm{b}}^\top\bm{S}^\top\bm{\Lambda}^{-1}_{h}\hat{\bm{y}}_{T}(h)$ and $\bm{\lambda}$ is a Lagrange multiplier vector. The Karush-Kuhn-Tucker (KKT) optimality conditions that need to be satisfied by $\breve{\bm{b}}_{T}(h)$ are
\begin{subequations}
\label{eq:optcond}
\begin{align}
\label{eq:difflagrange}
\nabla_{\mathring{\bm{b}}} \mathcal{L}[\breve{\bm{b}}_{T}(h), \bm{\lambda}^{*}] & = \bm{0}, \\
\breve{b}_{T, i}(h) & = 0, \qquad \forall i \in \mathcal{A}[\breve{\bm{b}}_{T}(h)], \\
\breve{b}_{T, i}(h) & > 0, \qquad \forall i \notin \mathcal{A}[\breve{\bm{b}}_{T}(h)], \\
\lambda^{*}_{i} & \geq 0, \qquad \forall i \in \mathcal{A}[\breve{\bm{b}}_{T}(h)], \\
\lambda^{*}_{i} & = 0, \qquad \forall i \notin \mathcal{A}[\breve{\bm{b}}_{T}(h)], \\
\lambda_{i}^{*} \breve{b}_{T, i}(h) & = 0, \qquad \forall i \in \{1, 2, \dots, n\}, \label{eq:kktcomple}
\end{align}
\end{subequations}
where $\breve{b}_{T, i}(h)$ is the $i$th component of $\breve{\bm{b}}_{T}(h)$ and $\mathcal{A}[\breve{\bm{b}}_{T}(h)]$ is referred to as the active set, and is defined as
\begin{align*}
\mathcal{A}[\breve{\bm{b}}_{T}(h)] = \left\{i \in (1, 2, \dots, n) \hspace*{2mm} \big|\hspace*{2mm} \breve{b}_{T, i}(h) = 0\right\}.
\end{align*}
The first optimality condition in Eq.~\eqref{eq:difflagrange} leads to $\bm{\lambda}^{*}$ being computed as
\begin{align*}
\bm{\lambda}^{*} = {\color{red} \nabla_{\breve{\bm{b}}}} q[\breve{\bm{b}}_{T}(h)] = \bm{S}^\top\bm{\Lambda}^{-1}_{h}\bm{S}\breve{\bm{b}}_{T}(h) - \bm{S}^\top\bm{\Lambda}^{-1}_{h}\hat{\bm{y}}_{T}(h).
\end{align*}
The conditions given in Eq.~\eqref{eq:kktcomple} are referred to as complementarity conditions. They indicate that, at the optimal solution, either the $i$th constraint is active or $\lambda_{i}^{*} = 0$, or both. In particular, the Lagrange multipliers associated with inactive inequality constraints are zero.
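These conditions also provide a simple numerical certificate of optimality for any candidate solution. A minimal sketch, assuming the quadratic term $\bm{A} = \bm{S}^\top\bm{\Lambda}_{h}^{-1}\bm{S}$ and the linear term $\bm{c} = \bm{S}^\top\bm{\Lambda}_{h}^{-1}\hat{\bm{y}}_{T}(h)$ are available as dense R objects:
\begin{verbatim}
# Check the KKT conditions for a candidate solution b (up to a tolerance).
kkt_ok <- function(A, cvec, b, tol = 1e-8) {
  lambda <- as.vector(A %*% b - cvec)  # multipliers = gradient of q at b
  all(b >= -tol) &&                    # primal feasibility: b >= 0
    all(lambda[b <= tol] >= -tol) &&   # dual feasibility on the active set
    all(abs(lambda * b) <= tol)        # complementary slackness
}
\end{verbatim}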
\section{Algorithms}\label{sec:quadalgo}
Since the pioneering work of \citet{Lawson1974}, a variety of methods for solving NNLS problems have been proposed. The following sections briefly explain a few of the algorithms that are suitable for solving large-scale problems, and how they would apply to the forecast reconciliation problem discussed here. A detailed review of methods for NNLS is given by \citet{Chen2009}.
\subsection{Block principal pivoting method}
The first widely used active set method for solving NNLS was that proposed by \citet{Lawson1974}. The basic idea of this method is to transform the inequality constrained least squares problem into a sequence of equality constrained problems.
The variables held at the boundary of the feasible region (i.e., at zero) form the ``active set'', while the remaining variables form the ``passive set''. A shortcoming of the standard active set method is that the algorithm is initialized with an empty passive set, and only one variable is moved from the active set to the passive set at each step. Although QR updating and down-dating techniques are used to speed up the computations, more iterations (in other words, more time) might be required to find the optimal active set when handling large-scale NNLS problems.
% In the presence of multiple right hand side columns in the normal equations (in other words, multiple columns of $\hat{\bm{y}}_{T}(h)$ representing the base forecasts of different forecast horizons), the passive set that corresponds to each column may not be unique. \citet{Bro1997} and \citet{VanBenthem2004} further developed the standard active set method for handling multiple right hand side variables. Their algorithms precompute certain cross-product matrices that are used in the normal equation formulation of the least squares solution to minimize redundant computations.
%One possibility is to initialize the algorithm with a non-empty passive set. A simple way to introduce such a set is to solve the original unconstrained least squares problem and then overwrite any negative values with zeros. This type of initialization is used widely in the field of multivariate curve resolution \citep{Andrew1998, Gemperline2003}. \citet{Wang2012} pointed out that such a guess would be reasonable in cases where there are few active constraints or when non-negativity is enforced in order to remove any observational or measurement errors that may be present. They proposed an alternative initialization strategy by using the solution of gradient projection methods after a few iterations. They showed theoretically that the sequence of sets generated by the projection-based algorithm converges to the optimal passive set of the active set method. Furthermore, they compared their choice of initialization with that of the overwrite solution of unconstrained least squares, and found the former to be better.
One possibility for enhancing the speed of the active set method is to exchange more than one variable between the active and passive sets at each iteration. This should be handled carefully, as it could lead to endless loops in the algorithm. Thus, these methods, which are referred to as ``block principal pivoting methods'', include a procedure for selecting a group of variables to exchange, and a backup rule to ensure the finite termination of the algorithm \citep{Judice1994}.
%\citet{Portugal1994} and \citet{Cantarella2004}.
The procedure begins with the monotone linear complementarity problem (LCP) induced by the KKT optimality conditions given in Eq.~\eqref{eq:optcond}, which need to be satisfied by the optimal solution.
The monotone linear complementarity problem is
\begin{align}
\mathring{\bm{g}} & = \bm{S}^\top[\bm{\Lambda}_{h}^{-1}\bm{S}\mathring{\bm{b}} - \bm{\Lambda}_{h}^{-1}\hat{\bm{y}}_{T}(h)],\label{eq:LCPgrad} \\
\mathring{\bm{g}} & \geq \bm{0}, \label{eq:LCPg} \\
\mathring{\bm{b}} & \geq \bm{0}, \label{eq:LCPb} \\
\mathring{b}_{i}\mathring{g}_{i} & = 0, \ \ i = 1, 2, \dots, n\label{eq:LCPprod},
\end{align}
where $\mathring{b}_{i}$ and $\mathring{g}_{i}$ are the $i$th components of the vectors $\mathring{\bm{b}}$ and $\mathring{\bm{g}}$ respectively. A point $(\mathring{\bm{b}}, \mathring{\bm{g}}) \in \mathbb{R}^{2n}$ is defined as a complementary solution if it satisfies Eq.~\eqref{eq:LCPgrad} and~\eqref{eq:LCPprod}.
Let the index set $\{1, 2, \dots, n\}$ be partitioned into two mutually exclusive subsets $F$ and $G$. Consider the partitions of $\mathring{\bm{b}}, \mathring{\bm{g}}$ and $\bm{S}$ according to the index sets $F$ and $G$ using the following notation:
\begin{align*}
\mathring{\bm{b}}_{F} & = [\mathring{b}_{i}]_{i \in F}, & \mathring{\bm{g}}_{F} & = [\mathring{g}_{i}]_{i \in F}, & \bm{S}_{F} & = [\bm{S}_{i}]_{i \in F}, \\
\mathring{\bm{b}}_{G} & = [\mathring{b}_{i}]_{i \in G}, & \mathring{\bm{g}}_{G} & = [\mathring{g}_{i}]_{i \in G}, & \bm{S}_{G} & = [\bm{S}_{i}]_{i \in G},
\end{align*}
where $\bm{S}_{i}$ is the $i$th column of $\bm{S}$.
The algorithm starts by assigning $\mathring{\bm{b}}_{G} = \bm{0}$ and $\mathring{\bm{g}}_{F} = \bm{0}$. This particular choice will ensure that Eq.~\eqref{eq:LCPprod} is always satisfied for any values of $\mathring{\bm{b}}_{F}$ and $\mathring{\bm{g}}_{G}$.
The computation of the remaining unknown quantities $\mathring{\bm{b}}_{F}$ and $\mathring{\bm{g}}_{G}$ can be carried out by solving the following unconstrained least squares problem:
{\color{red}
\begin{align}
\label{eq:basicquad}
\bar{\bm{b}}_{F} = \operatornamewithlimits{min}_{\mathring{\bm{b}}_{F}} \frac{1}{2}[\bm{S}_{F}\mathring{\bm{b}}_{F} - \hat{\bm{y}}_{T}(h)]^\top\bm{\Lambda}_{h}^{-1} [\bm{S}_{F}\mathring{\bm{b}}_{F} - \hat{\bm{y}}_{T}(h)]
%\operatornamewithlimits{min}_{\mathring{\bm{b}}_{F}} \frac{1}{2}\left\|\bm{S}_{F}\mathring{\bm{b}}_{F} - \hat{\bm{y}}_{T}(h)\right\|^{2}_{\bm{\Lambda}_{h}^{-1}},
\end{align}
}
and then setting
\begin{align}
\label{eq:basicgrad}
\bar{\bm{g}}_{G} = \bm{S}_{G}^\top[\bm{\Lambda}_{h}^{-1}\bm{S}_{F}\bar{\bm{b}}_{F} - \bm{\Lambda}_{h}^{-1}\hat{\bm{y}}_{T}(h)].
\end{align}
The solution pair ($\bar{\bm{b}}_{F}$, $\bar{\bm{g}}_{G}$) is referred to as a complementary basic solution. If a complementary basic solution satisfies $\bar{\bm{b}}_{F} \geq \bm{0}$ and $\bar{\bm{g}}_{G} \geq \bm{0}$, then we have reached the optimal solution of the NNLS problem. Otherwise, the complementary basic solution is infeasible; i.e., there exists at least one $i \in F$ with $\bar{b}_{i} < 0$ or one $i \in G$ with $\bar{g}_{i} < 0$.
In the presence of an infeasible solution, we need to update sets $F$ and $G$ by exchanging the variables for which Eqs.~\eqref{eq:LCPg} or~\eqref{eq:LCPb} do not hold. Define the following two sets, which correspond to the infeasibilities in sets $F$ and $G$:
\begin{align}
\label{eq:indexset}
I_{1} = \{i \in F: \bar{b}_{i} < \varepsilon\} \qquad \text{and} \qquad I_{2} = \{i \in G: \bar{g}_{i} < \varepsilon\},
\end{align}
where $\varepsilon = 10^{-12}$. For a given $\bar{I}_{1} \subseteq I_{1}$ and $\bar{I}_{2} \subseteq I_{2}$, $F$ and $G$ can be updated according to the following rules:
\begin{align}
\label{eq:indexsetrev}
F = (F - \bar{I}_{1}) \cup \bar{I}_{2} \qquad \text{and} \qquad G = (G - \bar{I}_{2}) \cup \bar{I}_{1}.
\end{align}
The first block principal pivoting algorithm for solving a strictly monotonic LCP is due to \citet{Kost1978}. Unfortunately, the use of blocks of variables for exchange can lead to a cycle, meaning that it is not guaranteed to provide the optimal solution. Although this occurs rarely, it is problematic \citep{Judice1989}. In a later paper, \citet{Judice1994} proposed an extension of this algorithm with finite termination, by incorporating Murty's single principal pivoting algorithm. Even though the algorithm proposed by \citet{Murty1974} has finite termination, convergence can be slow for applications with large numbers of variables, as it changes only one variable per iteration. However, this algorithm is still beneficial for obtaining a complementary basic solution with a smaller number of infeasibilities than before. The steps of the hybrid algorithm are given in Algorithm~\ref{alg:BPV}, where card($X$) denotes the cardinality of a set $X$.
\begin{algorithm}
\caption{Block principal pivoting algorithm}
\label{alg:BPV}
\begin{spacing}{1.3}
\begin{algorithmic}[1]
\Require Let $F = \emptyset$, $G = \{1, 2, \dots, n\}$, $\mathring{\bm{b}} = \bm{0}$, $\mathring{\bm{g}} = -\bm{S}^\top\bm{\Lambda}_{h}^{-1}\hat{\bm{y}}_{T}(h)$, $p = \bar{p} \leq 10$, and ninf $= n + 1$, and let $\alpha$ be a permutation of the set $\{1, 2, \dots, n\}$.
\If {$(\mathring{\bm{b}}_{F} \geq \bm{0} \hspace{2mm} \& \hspace{2mm} \mathring{\bm{g}}_{G} \geq \bm{0})$} \label{line:bpvopt}
\State Terminate the algorithm and $\breve{\bm{b}} = (\mathring{\bm{b}}_{F}, \bm{0})$ is the unique global solution.
\Else{}
\State Define $I_{1}$ and $I_{2}$ as given in Eq.~\eqref{eq:indexset}.
\If {(card$(I_{1} \cup I_{2}) <$ ninf)}
\State Set ninf $= \textnormal{card}(I_{1} \cup I_{2})$, $p = \bar{p}$, $\bar{I}_{1} = I_{1}$ and $\bar{I}_{2} = I_{2}$.
\ElsIf {(card$(I_{1} \cup I_{2}) \geq \textnormal{ninf}$ and $p \geq 1$)}
\State Set $p = p - 1$, $\bar{I}_{1} = I_{1}$ and $\bar{I}_{2} = I_{2}$.
\ElsIf {(card$(I_{1} \cup I_{2}) \geq \textnormal{ninf}$ and $p = 0$)}
\State Set $\bar{I}_{1} = \{r\}$ and $\bar{I}_{2} = \emptyset$, if $r \in I_{1},$
\State \hspace{5.3mm} $\bar{I}_{1} = \emptyset $ and $\bar{I}_{2} = \{r\}$, if $r \in I_{2}$,
\State where $r$ is the last element of the set $I_{1} \cup I_{2}$ as for the order defined by $\alpha$.
\EndIf
\State Update $F$ and $G$ as given by Eq.~\eqref{eq:indexsetrev}.
\State Compute $\bar{\bm{b}}_{F}$ and $\bar{\bm{g}}_{G}$ using Eqs.~\eqref{eq:basicquad} and~\eqref{eq:basicgrad} respectively, and assign $\mathring{\bm{b}}_{F} = \bar{\bm{b}}_{F}$ and $\mathring{\bm{g}}_{G} = \bar{\bm{g}}_{G}$.
\State Return to line~\ref{line:bpvopt}.
\EndIf
\end{algorithmic}
\end{spacing}
\end{algorithm}
In particular, if $\bm{\Lambda}_{h}$ is a diagonal matrix with positive elements and $\hat{\bm{y}}_{T}(h) > \bm{0}$, Algorithm~\ref{alg:BPV} can instead be initialized with $
F = \{1, 2, \dots, n\},\ G = \emptyset,\ \mathring{\bm{b}} = \tilde{\bm{b}},\ \mathring{\bm{g}} = \bm{0},$
where $\tilde{\bm{b}}$ is the unconstrained least squares solution for this diagonal $\bm{\Lambda}_{h}$.
Rather than selecting a subset of variables from $I_{1}$ and $I_{2}$, we speed up the computations by using the full sets of variables as $\bar{I}_{1}$ and $\bar{I}_{2}$ respectively. This is generally referred to as the ``full exchange rule''. The variable $p$ in Algorithm~\ref{alg:BPV} acts as a buffer for determining the number of full exchange steps that may be tried. The choice of $p$ plays an important role: it should be fairly small in order to prevent unnecessary computations in which the full exchange rule is not effective, but not too small, otherwise Murty's method may be activated several times, reducing the efficiency of the algorithm. In general, $p = 3$ is a good choice \citep{Judice1994}.
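The following R sketch implements Algorithm~\ref{alg:BPV} with the full exchange rule and Murty's single-variable backup, for dense $\bm{A} = \bm{S}^\top\bm{\Lambda}_{h}^{-1}\bm{S}$ and $\bm{c} = \bm{S}^\top\bm{\Lambda}_{h}^{-1}\hat{\bm{y}}_{T}(h)$. It takes $\alpha$ to be the identity permutation; a practical implementation would exploit sparsity.
\begin{verbatim}
bpv_nnls <- function(A, cvec, pbar = 3, eps = 1e-12, maxit = 1000) {
  n <- length(cvec)
  Fset <- integer(0)                       # passive set F (G is its complement)
  b <- numeric(n); g <- -cvec              # b = 0 and g = A b - cvec
  p <- pbar; ninf <- n + 1
  for (it in seq_len(maxit)) {
    Gset <- setdiff(seq_len(n), Fset)
    I1 <- Fset[b[Fset] < eps]              # primal infeasibilities in F
    I2 <- Gset[g[Gset] < eps]              # dual infeasibilities in G
    if (length(I1) + length(I2) == 0) return(b)  # optimal solution
    if (length(I1) + length(I2) < ninf) {        # progress: reset the buffer
      ninf <- length(I1) + length(I2); p <- pbar
    } else if (p >= 1) {                         # allow another full exchange
      p <- p - 1
    } else {                                     # Murty backup: move one index
      r <- max(c(I1, I2))                        # alpha = identity permutation
      I1 <- intersect(I1, r); I2 <- intersect(I2, r)
    }
    Fset <- sort(union(setdiff(Fset, I1), I2))
    b <- numeric(n)
    if (length(Fset) > 0)
      b[Fset] <- solve(A[Fset, Fset, drop = FALSE], cvec[Fset])
    g <- as.vector(A %*% b - cvec); g[Fset] <- 0
  }
  stop("maximum number of iterations reached")
}
\end{verbatim}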
%If the number of infeasible variables increases due to the exchange rule, then $p$ decreases by one. If the number of infeasible variables is still larger after $p - 1$ steps, then Murty's method is used until it reaches a complementary solution with a smaller number of infeasible variables than a certain required value that is stored in ninf. This will occur in a finite number of iterations, due to the finite termination property of the Murty's method. The algorithm then switches to the full exchange rule, and iterates until the number of infeasible variables reaches zero.
%The performance of the block pivoting algorithm depends heavily on the number of times the full exchange rule fails and it has to switch to Murty's method. Based on an extensive study, \citet{Kim2011} noted that Murty's method was not activated during the algorithm, which suggests that the full exchange rule will work effectively in practice.
%\citet{Kim2011} extended the ideas of \citet{Bro1997} and \citet{VanBenthem2004} used in the active set method to run the block principal pivoting algorithm efficiently for multiple right hand side columns in the normal equation; in other words, when multiple columns of ${\bm{y}}_{T}(h)$ exist.
% \subsection{Projection-based methods}
% This class includes a set of methods that can incorporate multiple active constraints at each iteration, using the gradient information. The gradient projection methods that belong to this class is commonly used in practice, and is extremely useful for obtaining a set of non-negative reconciled forecasts for hierarchical and grouped time series because, in this context, \begin{inparaenum}[(i)] \item the constraints defined are simple, and \item the matrices involved are large and/or sparse \citep{Nocedal2006}. \end{inparaenum} These facts facilitate the efficient performance of the computations, because iterations of the algorithm mainly involve projections onto the feasible set and the determination of a (steepest) descent direction.
%The following sections review the details of two algorithms that are of interest in this paper. The first algorithm combines the gradient projection with a conjugate gradient method in order to solve the non-negative least squares problem \citep{Nocedal2006}. The second algorithm uses the scaled gradient projection method, which differs from standard gradient projection methods by the presence of a scaling matrix that multiplies the gradient. The numerical experiments have shown that this method is also effective at handling large scale problems for a proper scaling matrix \citep{Bonettini2009}.
% There are also various other methods that fall into this class. For a detailed discussion, the interested reader is referred to the work of \citet{Chen2009} and the references provided therein.
\subsection{Gradient projection + conjugate gradient approach}
Each iteration of this algorithm follows two main steps. In the first step, the current feasible solution $\mathring{\bm{b}}$ is updated by searching along the steepest descent direction, $-\mathring{\bm{g}}$, from $\mathring{\bm{b}}$. If the lower bound of the inequality constraints (i.e., $\bm{0}$) is encountered before a minimizer is found along the line, the search direction is ``bent'' to ensure that it remains feasible. The search continues along the resulting piecewise-linear path in order to locate the first local minimizer of the objective function $q$; this point is referred to as the Cauchy point. The Cauchy point defines a set of constraints that are active at that point. Hence, in the second step, a subproblem is solved with the variables in this active set fixed at zero. The key steps of this method are given in Algorithm~\ref{alg:gradproj}. See \citet{Nocedal2006} for a detailed implementation of each step.
%\paragraph{Step 1: Computation of the Cauchy point}
%
%The search along the steepest direction from the current point $\ddot{\bm{b}}$ is denoted by $\ddot{\bm{b}}(s)$ and obtained by simply projecting the search direction at $\ddot{\bm{b}}$ onto the feasible region as follows:
%\begin{align*}
%\ddot{\bm{b}}(s) = (\ddot{\bm{b}} - s\ddot{\bm{g}})_{+},
%\end{align*}
%where $s \geq 0$ and $(\cdot)_{+}$ is the projection onto the non-negative orthant, defined as $(x)_{+} = \text{max}(0, x)$. It is a piecewise-linear function of $s$, and the objective function along this path $q[\ddot{\bm{b}}(s)]$ is quadratic in $s$ on every linear piece. Therefore, a suitable value for $s$ at each iteration can be found by searching along the path explicitly. The first local minimizer of $q[\ddot{\bm{b}}(s)]$ (in other words, the Cauchy point) is identified by examining each line segment that can be formed from $\ddot{\bm{b}}(s)$. The procedure is given below.
%\begin{enumerate}
% \item For each component, identify the value of $s$ that reaches the lower bound of zero when searching along the steepest direction. The value for each component is computed as
% \begin{align*}
% \ddot{s}_{i} & =
% \left\{\begin{array}{ll}
% \ddot{b}_{i} / \ddot{g}_{i}, & \mbox{$\text{if} \hspace{2mm} \ddot{g}_{i} > 0,$}\\
% \infty, & \mbox{otherwise}.
% \end{array}\right.
% \end{align*}
% For any $s$, the components of $\ddot{\bm{b}}(s)$ that retain feasibility are defined as
% \begin{align*}
% \ddot{b}_{i}(s) & =
% \left\{\begin{array}{ll}
% \ddot{b}_{i} - s\ddot{g}_{i}, & \mbox{$\text{if} \hspace{2mm} s \leq \ddot{s}_{i},$}\\
% \ddot{b}_{i} - \ddot{s}_{i}\ddot{g}_{i}, & \mbox{otherwise}.
% \end{array}\right.
% \end{align*}
% \item To obtain the first local minimizer along $\ddot{\bm{b}}(s)$, any duplicates or zeros are removed from the collection of $\ddot{s}_{i}$ values. The resulting unique set of breakpoints is then sorted in ascending order, giving $0 < \ddot{s}_{1} < \ddot{s}_{2} < \dots < \ddot{s}_{l}$. Each interval, $[0, \ddot{s}_{1}], [\ddot{s}_{1}, \ddot{s}_{2}], \dots, [\ddot{s}_{j-1}, \ddot{s}_{j}], \dots, [\ddot{s}_{l-1}, \ddot{s}_{l}]$, is evaluated sequentially, to locate the first local minimizer.
% \item Suppose that we could not locate the local minimizer up to $\ddot{s}_{j-1}$ and moved on to the next interval $[\ddot{s}_{j-1}, \ddot{s}_{j}]$. The line segment that passes through $[\ddot{\bm{b}}(\ddot{s}_{j-1}), \ddot{\bm{b}}(\ddot{s}_{j})]$ is given by
% \begin{align*}
% \ddot{\bm{b}}(s) = \ddot{\bm{b}}(\ddot{s}_{j-1}) + (\Delta s)\bm{p}^{j-1},
% \end{align*}
% where
% \begin{align*}
% \Delta s = s - \ddot{s}_{j-1} \in [0, \ddot{s}_{j} - \ddot{s}_{j-1}],
% \end{align*}
% and
% \begin{align*}
% p_{i}^{j-1} & =
% \left\{\begin{array}{ll}
% -\ddot{g}_{i}, & \mbox{$\text{if} \hspace{2mm} \ddot{s}_{j-1} < \ddot{s}_{i},$}\\
% 0, & \mbox{otherwise}.
% \end{array}\right.
% \end{align*}
% \item Substituting $\ddot{\bm{b}}(s)$ into the objective function $q$ and minimizing with respect to $\Delta s$ yields
% \begin{align*}
% \Delta s^{*} = -\frac{f'_{j-1}}{f''_{j-1}},
% \end{align*}
% where
% \begin{align*}
% f'_{j-1} & = \ddot{\bm{b}}(\ddot{s}_{j-1})'\bm{S}'\bm{\Lambda}_{h}^{-1}\bm{S}\bm{p}^{j-1} - \hat{\bm{y}}_{T}(h)'\bm{\Lambda}_{h}^{-1}\bm{S}\bm{p}^{j-1},\\
% f''_{j-1} & = (\bm{p}^{j-1})'\bm{S}'\bm{\Lambda}_{h}^{-1}\bm{S}\bm{p}^{j-1}.
% \end{align*}
% \begin{enumerate}
% \item If $f'_{j-1} > 0$, a local minimizer can be obtained at $s = \ddot{s}_{j-1}$.
% \item If $\Delta s^{*} \in [0, \ddot{s}_{j} - \ddot{s}_{j-1})$, there is a minimizer at $s = \ddot{s}_{j-1} + \Delta s^{*}$.
% \item Otherwise, consider the next interval $[\ddot{s}_{j}, \ddot{s}_{j+1}]$ and continue.
% \end{enumerate}
%\end{enumerate}
%
%\paragraph{Step 2: Solving the subproblem}
%
%Once the Cauchy point $\bm{b}^{c}$ has been identified, a set of active constraints that corresponds to $\bm{b}^{c}$ can be defined as
%\begin{align*}
%\mathcal{A}\left(\bm{b}^{c}\right) = \left\{i \in (1, 2, \dots, n) \hspace*{2mm}\big|\hspace*{2mm}b^{c}_{i} = 0 \right\}.
%\end{align*}
%
%This reduces the original minimization problem to be of the form
%\begin{align*}
%\operatornamewithlimits{min}_{\ddot{\bm{b}}} q(\ddot{\bm{b}}) \coloneqq \operatornamewithlimits{min}_{\ddot{\bm{b}}} \frac{1}{2}\ddot{\bm{b}}'\bm{S}'\bm{\Lambda}^{-1}_{h}\bm{S}\ddot{\bm{b}} & - \ddot{\bm{b}}'\bm{S}'\bm{\Lambda}^{-1}_{h}\hat{\bm{y}}_{T}(h) \nonumber\\
%\text{s.t.} \qquad \ddot{b}_{i} & = 0, \hspace*{3mm} i \in \mathcal{A}(\bm{b}^{c}),\\
%\ddot{b}_{i} & \geq 0, \hspace*{3mm} i \notin \mathcal{A}(\bm{b}^{c}).
%\end{align*}
%
%As was pointed out by \citet{Nocedal2006}, it is sufficient to find an approximate non-negative solution $\ddot{\bm{b}}^{+}$ of the above minimization problem such that $q(\ddot{\bm{b}}^{+}) \leq q(\bm{b}^{c})$. In general, the conjugate gradient (CG) algorithm is preferred and more appropriate in this context, as the Jacobian of the above equality constraints and the corresponding null-space basis matrix have simple representations. Specifically, the projected conjugate gradient algorithm is used (as explained in Algorithm~\ref{alg:pcg}) with a preconditioner. The algorithm terminates as soon as a non-negative solution with a better objective function value is encountered.
%
%\begin{algorithm}
% \caption{Projected CG algorithm}
% \label{alg:pcg}
% \begin{spacing}{1.3}
% \begin{algorithmic}[1]
% \Require Assign $\ddot{\bm{b}}^{+} = \bm{b}^{c}$ and compute $\bm{r} = \bm{S}'\bm{\Lambda}^{-1}_{h}\bm{S}\ddot{\bm{b}}^{+} - \bm{S}'\bm{\Lambda}^{-1}_{h}\hat{\bm{y}}_{T}(h)$, $\bm{g}_{\textnormal{proj}} = \bm{P}_{\tiny \textnormal{proj}}\bm{r}$ and $\bm{d} = -\bm{g}_{\textnormal{proj}}$ for a given $\bm{P}_{\tiny \textnormal{proj}}$.
% \Repeat
% \State $\alpha = \bm{r}'\bm{g}_{\textnormal{proj}} / \bm{d}'\bm{S}'\bm{\Lambda}_{h}^{-1}\bm{S} \bm{d}.$
% \State $\ddot{\bm{b}}^{+} = \ddot{\bm{b}}^{+} + \alpha \bm{d}.$
% \State $\bm{r}^{+} = \bm{r} + \alpha \bm{S}'\bm{\Lambda}_{h}^{-1}\bm{S} \bm{d}.$
% \State $\bm{g}^{+} = \bm{P}_{\tiny \textnormal{proj}}\bm{r}^{+}.$
% \State $\beta = (\bm{r}^{+})'\bm{g}^{+} / \bm{r}'\bm{g}_{\textnormal{proj}}.$
% \State $\bm{d} = -\bm{g}^{+} + \beta \bm{d}.$
% \State $\bm{g}_{\textnormal{proj}} = \bm{g}^{+}.$
% \State $\bm{r} = \bm{r}^{+}.$
% \Until $\ddot{\bm{b}}^{+}$ is feasible and $q(\ddot{\bm{b}}^{+}) \leq q(\bm{b}^{c})$ is satisfied.
% \end{algorithmic}
% \end{spacing}
%\end{algorithm}
%
%In Algorithm~\ref{alg:pcg}, $\bm{P}_{\text{proj}}$ denotes the projection matrix involving a diagonal precondition matrix. It is defined as
%\begin{align*}
%\bm{P}_{\text{proj}} & =
%\left\{\begin{array}{ll}
%\left[\left(\bm{S}'\bm{\Lambda}_{h}^{-1}\bm{S}\right)_{ii}\right]^{-1}, & \mbox{$\text{if} \hspace{2mm} i \notin \mathcal{A}(\bm{b}^{c}),$}\\
%0, & \mbox{$\text{otherwise}$}.
%\end{array}\right.
%\end{align*}
%
%The gradient projection method using the Cauchy point to solve the non-negative quadratic programming problem is summarized in Algorithm~\ref{alg:gradproj}.
\begin{algorithm}
\caption{Gradient projection based on the Cauchy point}
\label{alg:gradproj}
\begin{spacing}{1.3}
\begin{algorithmic}[1]
\Require Choose a feasible initial solution $\mathring{\bm{b}}^{0}$.
\ForAll{k in 0, 1, 2, \dots}
\If{$\mathring{\bm{b}}^{k}$ satisfies the KKT conditions}
\State Terminate the algorithm. $\breve{\bm{b}} = \mathring{\bm{b}}^{k}$ is the unique global solution.
\Else {}
\State Find the Cauchy point $\bm{b}^{c}$ using $\mathring{\bm{b}}^{k}$.
\State \multiline{Use projected conjugate gradient algorithm with a diagonal preconditioner to find an approximate feasible solution $\mathring{\bm{b}}^{+}$ that satisfies $q(\mathring{\bm{b}}^{+}) \leq q(\bm{b}^{c})$.}
\State $\mathring{\bm{b}}^{k+1} = \mathring{\bm{b}}^{+}$.
\EndIf
\EndFor
\end{algorithmic}
\end{spacing}
\end{algorithm}
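As an illustration of the first step, the following dense R sketch computes the Cauchy point for $q(\mathring{\bm{b}}) = \frac{1}{2}\mathring{\bm{b}}^\top\bm{A}\mathring{\bm{b}} - \mathring{\bm{b}}^\top\bm{c}$ by walking along the segments of the projected steepest-descent path $[\mathring{\bm{b}} - s\mathring{\bm{g}}]_{+}$:
\begin{verbatim}
cauchy_point <- function(A, cvec, b) {
  g <- as.vector(A %*% b - cvec)          # gradient of q at the current point
  sbreak <- ifelse(g > 0, b / g, Inf)     # step at which each component hits 0
  ts <- sort(unique(sbreak[sbreak > 0 & is.finite(sbreak)]))
  s_prev <- 0
  for (s_next in c(ts, Inf)) {            # examine each linear segment in turn
    x  <- pmax(b - s_prev * g, 0)         # start of the current segment
    p  <- ifelse(sbreak > s_prev, -g, 0)  # components still free to move
    f1 <- sum(p * (A %*% x - cvec))       # slope of q along the segment
    f2 <- sum(p * (A %*% p))              # curvature of q along the segment
    if (f1 >= 0) return(x)                # first local minimizer at segment start
    ds <- -f1 / f2
    if (ds < s_next - s_prev)             # minimizer inside this segment
      return(pmax(x + ds * p, 0))
    s_prev <- s_next
  }
  x                                       # not reached when A is positive definite
}
\end{verbatim}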
\subsection{Scaled gradient projection}
The diagonally scaled gradient projection algorithm was proposed by \citet{Bonettini2009}. The algorithm proceeds by determining a descent direction at each iteration, based on the current feasible solution, step length and scaling matrix. The solution vector is then adjusted along this direction using a non-monotone line search, which does not guarantee a decrease in the objective function value at every iteration; this increases the likelihood of locating a global optimum in practice \citep{Birgin2003}. The main steps are given in Algorithm~\ref{alg:sgp}.
\begin{algorithm}
\caption{Diagonally scaled gradient projection algorithm}
\label{alg:sgp}
\begin{spacing}{1.3}
\begin{algorithmic}[1]
\Require Choose a feasible initial solution $\mathring{\bm{b}}^{0}$. Set the parameters $\eta, \theta \in (0, 1)$, $0 < \alpha_{\text{min}} < \alpha_{\text{max}}$, and a positive integer $M$. Use a diagonal scaling matrix $\bm{D} = \text{diag}(d_1, d_2, \dots, d_n)$ with elements $d_i = 1 / (\bS^\top\bm{\Lambda}_h^{-1}\bS)_{ii}$ for $i = 1, 2, \dots, n$.
\ForAll{k in 0, 1, 2, \dots}
\State\multiline{Choose the step-length parameter $\alpha_{k} \in [\alpha_{\text{min}}, \alpha_{\text{max}}]$ using the alternation strategy proposed by \citet{Bonettini2009}.}
\State{Projection: $\bm{z}^{k} = [\mathring{\bm{b}}^{k} - \alpha_{k}\bm{D}\mathring{\bm{g}}(\bm{b}^{k})]_{+}$, where $(x)_+ = \text{max}(0, x)$.}
\If{$\bm{z}^{k} = \mathring{\bm{b}}^{k}$}
\State Terminate the algorithm. $\breve{\bm{b}} = \bm{b}^{k}$ is the unique global minimum.
\Else{}
\State{Descent direction: $\bm{d}^{k} = \bm{z}^{k} - \mathring{\bm{b}}^{k}$.}
\State{Set $\lambda_{k} = 1$ and $q_{\text{max}} = \maxidx\limits_{j \in 0, 1, \dots, \textnormal{min}(k, M-1)}q(\mathring{\bm{b}}^{k-j})$} \vspace*{1.5mm}
\State{Backtracking loop:} \label{line:bcktck}
\Statex \myifthen{$q(\mathring{\bm{b}}^{k} + \lambda_{k} \bm{d}^{k}) \leq q_{\text{max}} + \eta \lambda_{k} [\mathring{\bm{g}}(\mathring{\bm{b}}^{k})]^\top\bm{d}^{k}$}
\Statex \hspace{2cm} Go to line~\ref{line:update}.
\Statex \myelse {}
\Statex \hspace{2cm} Set $\lambda_{k} = \theta \lambda_{k}$ and go to line~\ref{line:bcktck}.
\Statex \myifend
\State Set $\mathring{\bm{b}}^{k+1} = \mathring{\bm{b}}^{k} + \lambda_{k} \bm{d}^{k}$. \label{line:update}
\EndIf
\EndFor
\end{algorithmic}
\end{spacing}
\end{algorithm}
In selecting the step-length parameter for the scaled gradient projection algorithm, \citet{Bonettini2009} extended the original ideas of \citet{Barzilai1988} that are commonly used for improving the convergence rate of standard gradient methods. The generalized Barzilai \& Borwein (BB) rules are
\begin{align*}
\alpha_{k}^{BB1} & = \frac{(\bm{u}^{k-1})^\top\bm{D}^{-1} \bm{D}^{-1} \bm{u}^{k-1}}{(\bm{u}^{k-1})^\top\bm{D}^{-1}\bm{v}^{k-1}}, \\
\alpha_{k}^{BB2} & = \frac{(\bm{u}^{k-1})^\top\bm{D}\bm{v}^{k-1}}{(\bm{v}^{k-1})^\top\bm{D}\bm{D}\bm{v}^{k-1}},
\end{align*}
where $\bm{u}^{k-1} = \mathring{\bm{b}}^{k} - \mathring{\bm{b}}^{k-1}$ and $\bm{v}^{k-1} = \mathring{\bm{g}}(\mathring{\bm{b}}^{k}) - \mathring{\bm{g}}(\mathring{\bm{b}}^{k-1})$. Specifically, these equations reduce to the standard BB rules when $\bm{D} = \bm{I}_{n}$.
When $[\mathring{\bm{b}}^{k} - \alpha_{k} \bm{D} \mathring{\bm{g}}(\mathring{\bm{b}}^{k})]_{+}$ is used to generate a descent direction from $\mathring{\bm{b}}^{k}$ and $\bm{\Lambda}_{h} \propto \bm{I}_{m}$, the diagonal scaling matrix and the generalized BB rules reduce the whole process to a standard gradient projection method with standard BB rules. Thus, the scaled gradient projection algorithm might not be computationally advantageous for obtaining non-negative reconciled forecasts from the OLS approach proposed by \citet{Hyndman2011}.
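A sketch of the generalized BB rules as R code, with the clamping to $[\alpha_{\text{min}}, \alpha_{\text{max}}]$ discussed below; the handling of non-positive denominators is simplified here for brevity.
\begin{verbatim}
# Generalized BB step lengths for a diagonal scaling matrix D = diag(d), with
# u = b^k - b^{k-1} and v = g(b^k) - g(b^{k-1}); clamped to [a_min, a_max].
bb_steps <- function(u, v, d, a_min = 1e-5, a_max = 1e5) {
  bb1 <- sum((u / d)^2) / sum((u / d) * v)      # alpha_k^{BB1}
  bb2 <- sum(u * d * v) / sum(v * d * d * v)    # alpha_k^{BB2}
  c(BB1 = min(max(bb1, a_min), a_max),
    BB2 = min(max(bb2, a_min), a_max))
}
\end{verbatim}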
\subsubsection{Selection of tuning parameters}
We now discuss the roles of the tuning parameters and their recommended values.
\begin{compactitem}
\item $\alpha_0$: We consider a method similar to that of \citet{Figueiredo2007}, implemented in standard gradient projection methods, within the context of the scaled gradient projection methods. It takes the initial value $\alpha_{0}$ to be the exact minimizer of the objective function along the direction $\mathring{\bm{b}}^{0} - \alpha \bm{D} \mathring{\bm{g}}(\mathring{\bm{b}}^{0})$ when the constraints are ignored (a short numerical sketch follows this list). This involves defining
\begin{align*}
\bm{p}^{0} = \left(p^{0}_{i}\right) & =
\left\{\begin{array}{ll}
[\mathring{\bm{g}}(\mathring{\bm{b}}^{0})]_{i}, & \mbox{$\text{if} \hspace{2mm} \mathring{b}_{i}^{0} > 0 \hspace{2mm} \text{or} \hspace{2mm} [\mathring{\bm{g}}(\mathring{\bm{b}}^{0})]_{i} < 0,$} \\
0, & \mbox{\hspace{1cm} \text{otherwise}}.
\end{array}\right.
\end{align*}
The initial guess is estimated as
\begin{align*}
\alpha_{0} = \operatornamewithlimits{min}_{\alpha} q[\mathring{\bm{b}}^{0} - \alpha \bm{D} \mathring{\bm{g}}(\mathring{\bm{b}}^{0})],
\end{align*}
and can be computed analytically using
\begin{align*}
\alpha_{0} = \frac{(\bm{p}^{0})^\top\bm{D}\bm{p}^{0}}{(\bm{S}\bm{D}\bm{p}^{0})^\top\bm{\Lambda}_{h}^{-1}(\bm{S}\bm{D}\bm{p}^{0})}.
\end{align*}
\item $\eta$ and $\theta$ control the amount by which the objective function should be decreased and the number of backtracking reductions to be performed, respectively. The values of $\eta = 10^{-4}$ and $\theta = 0.4$ have been used in order to get a sufficiently large step size with fewer reductions \citep{Bonettini2009}.
\item $\alpha_{\text{min}}$ and $\alpha_{\text{max}}$ are the lower and upper bounds of the step-length parameter $\alpha_{k},$ to avoid using unnecessary extreme values. Even though a large range is defined for the BB-type rules in practice, \citet{Bertero2013} found the interval ($10^{-5}$, $10^{5}$) to be suitable for the generalized BB rules.
\item $\tau_{1}$ is the initial switching condition that activates the step-length alternation strategy, and a value of 0.5 is suitable in many applications \citep{Bonettini2009, Bertero2013}.
\item We set $M_{\alpha} = 3$, as was used by \citet{Bonettini2009} and \citet{Bertero2013}.
\item $M$ determines the monotone ($M =1$) or non-monotone ($M > 1$) line search to be performed in the backtracking loop. This value should not be too large, as the decrease in the objective function is difficult to control, and is set to $10$ here, as was done by \citet{Bonettini2009}.
\end{compactitem}
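As a short numerical sketch of the $\alpha_0$ computation above, with $\bm{D} = \text{diag}(\bm{d})$ and dense inputs for clarity:
\begin{verbatim}
alpha0 <- function(S, Li, d, b0, yhat) {    # Li = Lambda^{-1}, d = diag(D)
  g0 <- as.vector(t(S) %*% (Li %*% (S %*% b0 - yhat)))  # gradient at b0
  p0 <- ifelse(b0 > 0 | g0 < 0, g0, 0)                  # components of p^0
  w  <- as.vector(S %*% (d * p0))                       # w = S D p^0
  sum(d * p0^2) / sum(w * (Li %*% w))                   # p0' D p0 / w' Li w
}
\end{verbatim}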
\section{Monte Carlo experiments}
\label{sec:MCNN}
Monte Carlo experiments can help demonstrate the practical usefulness of the aforementioned algorithms for obtaining a set of non-negative reconciled forecasts. For the sake of simplicity, the WLS approach based on structural weights (WLS$_{s}$) is considered \citep{Wick2018}. The computational efficiency of these algorithms is evaluated over a series of hierarchies, ranging in size from small to large. We study the behaviour of two possible choices of the initial solution: \begin{inparaenum}[(i)] \item the base forecasts at the bottom level; and \item the unconstrained WLS$_{s}$ forecasts. \end{inparaenum} The latter choice can sometimes be a better alternative, as it is aggregate consistent and computationally less demanding. For the projection-based approaches, the unconstrained WLS$_{s}$ forecasts are projected onto the non-negative orthant, as these algorithms need a feasible initial solution; the block principal pivoting algorithm can use the unconstrained solution directly. The acronyms used to distinguish the algorithms and their variations of interest are listed in Table~\ref{tbl:acronn}.
\begin{table}[ht]
\centering
\caption{Acronyms for NNLS algorithms.}
\label{tbl:acronn}
\begin{tabular}{lr}
\toprule
Algorithm & Notation \\
\midrule
Scaled gradient projection & SGP \\[0.1cm]
Gradient projection + conjugate gradient & PCG \\[0.1cm]
Block principal pivoting & BPV \\
\bottomrule
\end{tabular}
\end{table}
All experiments are performed in R on a Linux machine equipped with a 3.20GHz Intel Quad-core processor and 8GB memory. A parallel computing environment with two workers is established by using the \texttt{doParallel} \citep{doparallel2015} and \texttt{foreach} \citep{foreach2015} packages in R to parallelize the procedure of computing non-negative reconciled forecasts for different forecast horizons.
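A sketch of this parallel set-up is given below, where \texttt{nnls\_reconcile()} stands in for any of the solvers of Section~\ref{sec:quadalgo} and \texttt{base\_forecasts} holds the base forecasts by horizon (both names are illustrative placeholders):
\begin{verbatim}
library(doParallel)              # also attaches foreach and parallel
cl <- makeCluster(2)             # two workers, as in the experiments
registerDoParallel(cl)
revised <- foreach(h = 1:6, .combine = cbind) %dopar% {
  nnls_reconcile(S, Lambda, base_forecasts[, h])   # one horizon per task
}
stopCluster(cl)
\end{verbatim}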
Initially, we consider a hierarchy with $K=1$ level, having three series at the bottom level. The base forecast for the most aggregated series in the hierarchy is generated from a uniform distribution on the interval $(1.5e^{K}, 2e^{K})$, where $K$ is the number of levels in the hierarchy. This is then disaggregated to the bottom level based on a set of proportions that sum to one. Specifically, a set of values is drawn from a gamma distribution with both the shape and scale parameters set to two, and these values are normalized so that they sum to one. Noise is then added to the series at the aggregated levels to make them aggregate-inconsistent. If any of the base forecasts become negative after the noise is added, they are set to zero, ensuring that all base forecasts in the hierarchy are non-negative. The whole procedure is repeated until there is at least one negative reconciled forecast at the bottom level, and six such sets are used to represent a forecast horizon of length six. The number of levels in the hierarchy is increased gradually to construct much larger hierarchies by adding a mixture of three and four nodes to each of the bottom-level nodes in the preceding hierarchy. Table~\ref{tbl:negstrwls} presents the structure of each hierarchy constructed and the number of negative reconciled forecasts at the bottom level for each forecast horizon. Each hierarchy contains approximately 5--10\% negative reconciled forecasts at the bottom level. These are then revised using the algorithms discussed in Section~\ref{sec:quadalgo}, in order to obtain a set of non-negative reconciled forecasts.
\begin{table}[ht]
% \tabcolsep=0.18cm
\caption{The structure of each hierarchy generated and the numbers of negative reconciled forecasts that result from the WLS$_{s}$ approach.}
\label{tbl:negstrwls}
\centering
\begin{threeparttable}
\begin{tabular}{rrrrrrrrr}
\toprule
& & & \multicolumn{6}{c}{Forecast horizon $(h)$} \\[-0.3cm]\\\cline{4-9}\\[-0.3cm]
$K$ & $m$ & $n$ & 1 & 2 & 3 & 4 & 5 & 6 \\
\midrule
1 & 4 & 3 & 1 & 1 & 1 & 1 & 1 & 1 \\
2 & 14 & 10 & 1 & 1 & 1 & 2 & 1 & 1 \\
3 & 49 & 35 & 1 & 1 & 1 & 1 & 2 & 1 \\
4 & 171 & 122 & 1 & 1 & 3 & 5 & 2 & 4 \\
5 & 598 & 427 & 13 & 10 & 10 & 9 & 6 & 7 \\
6 & 2092 & 1494 & 62 & 38 & 60 & 56 & 44 & 43 \\
7 & 7321 & 5229 & 211 & 229 & 255 & 270 & 222 & 203 \\
8 & 25622 & 18301 & 1128 & 1166 & 1026 & 1268 & 1285 & 1186 \\
9 & 89675 & 64053 & 4738 & 5619 & 6244 & 4812 & 5779 & 5637 \\
10 & 249808 & 160133 & 17744 & 17790 & 16023 & 17261 & 19462 & 17258 \\
11 & 650141 & 400333 & 36271 & 47845 & 44040 & 40790 & 47879 & 38753 \\
12 & 1650974 & 1000833 & 105462 & 84998 & 76817 & 102130 & 95530 & 80090 \\
\bottomrule
\end{tabular}
\begin{tablenotes}
\item [] Notes: $K$: number of levels; $m$: total number of series; $n$: number of series at the bottom level.
\item [] For $K > 9$, either two or three nodes are added to each of the bottom-level nodes in the preceding hierarchy.
\end{tablenotes}
\end{threeparttable}
\end{table}
\begin{table}[ht]
% \fontsize{9}{12}
\small
\tabcolsep=0.20cm
\caption{Computational efficiency of the non-negative forecast reconciliation from the WLS$_{s}$ approach using base forecasts as the initial solution.}
\label{tbl:perfnnwlsb}
\centering
\begin{threeparttable}
\begin{tabular}{llllrrrrrrr}
\toprule
& & & & \multicolumn{6}{c}{Forecast horizon $(h)$} \\[-0.3cm]\\\cline{5-10}\\[-0.3cm]
$K$ & $m$ & $n$ & & 1 & 2 & 3 & 4 & 5 & 6 & Time $(s)$ \\
\midrule
1 & 4 & 3 & WLS$_{s}$ & & & & & & & 0.01 \\
& & & SGP & 3 & 3 & 3 & 3 & 3 & 3 & \bm{$0.04$} \\
& & & PCG & 1 & 1 & 1 & 1 & 1 & 1 & \bm{$0.04$} \\
\midrule
2 & 14 & 10 & WLS$_{s}$ & & & & & & & 0.01 \\
& & & SGP & 342 & 265 & 23 & 481 & 22 & 497 & 1.52 \\
& & & PCG & 1 & 2 & 1 & 1 & 1 & 2 & \bm{$0.10$} \\
\midrule
3 & 49 & 35 & WLS$_{s}$ & & & & & & & 0.01 \\
& & & SGP & 276 & 461 & 391 & 392 & 737 & 423 & 1.86 \\
& & & PCG & 2 & 2 & 2 & 1 & 1 & 2 & \bm{$0.17$} \\
\midrule
4 & 171 & 122 & WLS$_{s}$ & & & & & & & 0.01 \\
& & & SGP & 456 & 496 & 389 & 480 & 364 & 498 & 2.13 \\
& & & PCG & 2 & 2 & 2 & 2 & 2 & 2 & \bm{$0.29$} \\
\midrule
5 & 598 & 427 & WLS$_{s}$ & & & & & & & 0.01 \\
& & & SGP & 375 & 357 & 379 & 352 & 359 & 270 & 1.94 \\
& & & PCG & 2 & 3 & 2 & 3 & 2 & 3 & \bm{$0.33$} \\
\midrule
6 & 2092 & 1494 & WLS$_{s}$ & & & & & & & 0.02 \\
& & & SGP & 1239 & 354 & 419 & 7803 & 354 & 382 & 29.11 \\
& & & PCG & 5 & 6 & 6 & 6 & 4 & 6 & \bm{$0.51$} \\
\midrule
7 & 7321 & 5229 & WLS$_{s}$ & & & & & & & 0.06 \\
& & & SGP & 1020 & 851 & 10$^{4*}$ & 10$^{4*}$ & 614 & 369 & 103.16 \\
& & & PCG & 9 & 8 & 8 & 9 & 7 & 7 & \bm{$2.09$} \\
\midrule
8 & 25622 & 18301 & WLS$_{s}$ & & & & & & & 0.19 \\
& & & PCG & 13 & 14 & 13 & 13 & 13 & 12 & \bm{$19.08$} \\
\midrule
9 & 89675 & 64053 & WLS$_{s}$ & & & & & & & 0.48 \\
& & & PCG & 16 & 16 & 16 & 17 & 17 & 21 & \bm{$244.68$} \\
\midrule
10 & 249808 & 160133 & WLS$_{s}$ & & & & & & & 1.43 \\
& & & PCG & 19 & 20 & 19 & 17 & 20 & 20 & \bm{$1660.12$} \\
% 11 & 265720 & 177147 & OLS & & & & & & & \\
% & & & PCG & & & & & & & \\
\bottomrule
\end{tabular}
\begin{tablenotes}
\item [] Notes: WLS$_{s}$ denotes the unconstrained WLS$_{s}$ approach.
\item [] The computational time is averaged over 50 replications for $K = 1$ to $9$, but over only 10 replications for $K = 10$, as the computational time is considerable.
\item [] For $K \geq 8$, only PCG is performed, due to the high computational time of SGP.
\end{tablenotes}
\end{threeparttable}
\end{table}
\begin{table}[ht]
%\fontsize{9}{12}
\small
\tabcolsep=0.20cm
\captionsetup{belowskip=0pt, aboveskip=4pt}
\caption{Computational efficiency of the non-negative forecast reconciliation from the WLS$_{s}$ approach using (projected) unconstrained WLS$_{s}$ forecasts as the initial solution.}
\label{tbl:perfnnwlsp}
\centering
\begin{threeparttable}
\begin{tabular}{llllrrrrrrr}
\toprule
& & & & \multicolumn{6}{c}{Forecast horizon $(h)$} \\[-0.4cm]\\\cline{5-10}\\[-0.3cm]
$K$ & $m$ & $n$ & & 1 & 2 & 3 & 4 & 5 & 6 & Time $(s)$ \\
\midrule
1 & 4 & 3 & SGP & 1 & 1 & 1 & 1 & 1 & 1 & \bm{$0.03$} \\
& & & PCG & 1 & 1 & 1 & 1 & 1 & 1 & \bm{$0.03$} \\
& & & BPV & 1 & 1 & 1 & 1 & 1 & 1 & 0.06 \\
\midrule
2 & 14 & 10 & SGP & 287 & 17 & 22 & 512 & 357 & 35 & 0.88 \\
& & & PCG & 1 & 1 & 1 & 1 & 1 & 2 & 0.09 \\
& & & BPV & 1 & 1 & 1 & 1 & 1 & 2 & \bm{$0.08$} \\
\midrule
3 & 49 & 35 & SGP & 290 & 347 & 297 & 374 & 621 & 388 & 1.80 \\
& & & PCG & 1 & 1 & 1 & 1 & 1 & 1 & 0.13 \\
& & & BPV & 1 & 1 & 1 & 1 & 1 & 1 & \bm{$0.07$} \\
\midrule
4 & 171 & 122 & SGP & 515 & 312 & 423 & 431 & 368 & 12 & 2.02 \\
& & & PCG & 1 & 1 & 1 & 1 & 1 & 1 & 0.17 \\
& & & BPV & 1 & 1 & 1 & 1 & 1 & 1 & \bm{$0.09$} \\
\midrule
5 & 598 & 427 & SGP & 455 & 380 & 363 & 334 & 419 & 368 & 2.27 \\
& & & PCG & 1 & 1 & 1 & 1 & 1 & 1 & 0.22 \\
& & & BPV & 1 & 1 & 1 & 1 & 2 & 1 & \bm{$0.12$} \\
\midrule
6 & 2092 & 1494 & SGP & 1214 & 282 & 424 & 5947 & 390 & 340 & 22.32 \\
& & & PCG & 3 & 2 & 2 & 2 & 2 & 1 & 0.36 \\
& & & BPV & 2 & 2 & 1 & 2 & 2 & 1 & \bm{$0.17$} \\
\midrule
7 & 7321 & 5229 & SGP & 879 & 663 & 10$^{4*}$ & 10$^{4*}$ & 766 & 374 & 101.72 \\
& & & PCG & 3 & 4 & 6 & 4 & 3 & 3 & 0.68 \\
& & & BPV & 2 & 2 & 2 & 2 & 2 & 2 & \bm{$0.57$} \\
\midrule
8 & 25622 & 18301 & PCG & 6 & 8 & 8 & 7 & 6 & 5 & 3.18 \\
& & & BPV & 2 & 2 & 2 & 2 & 3 & 2 & \bm{$1.76$} \\
\midrule
9 & 89675 & 64053 & PCG & 12 & 11 & 14 & 11 & 10 & 14 & 35.08 \\
& & & BPV & 3 & 3 & 3 & 3 & 3 & 3 & \bm{$6.45$} \\
\midrule
10 & 249808 & 160133 & PCG & 16 & 16 & 18 & 17 & 17 & 16 & 250.22 \\
& & & BPV & 3 & 3 & 3 & 3 & 3 & 3 & \bm{$21.00$} \\
\midrule
11 & 650141 & 400333 & PCG & 18 & 19 & 21 & 22 & 18 & 21 & 1597.10 \\
& & & BPV & 3 & 3 & 3 & 3 & 3 & 3 & \bm{$56.84$} \\
\midrule
12 & 1650974 & 1000833 & BPV & 3 & 3 & 3 & 3 & 3 & 3 & \bm{$3247.09$} \\
\bottomrule
\end{tabular}
\begin{tablenotes}
\item [] Notes: The computational time is averaged over 50 replications for $K = 1$ to $9$, but over only 10 replications for $K > 9$, as the computational time is considerable.
\item [] SGP is performed only up to $K = 7$, and PCG only up to $K = 11$, due to their high computational times.
\end{tablenotes}
\end{threeparttable}
\end{table}
Tables~\ref{tbl:perfnnwlsb} and~\ref{tbl:perfnnwlsp} present the numbers of iterations and the average computational times in seconds $(s)$ required to reach the KKT optimality conditions when the base forecasts and the projected unconstrained WLS$_{s}$ forecasts, respectively, are used as the initial solution. Cases where an algorithm reaches the maximum number of iterations $(10^{4})$ are marked with an asterisk, and the average computational time corresponding to that number of iterations is given. The first row for each hierarchy in Table~\ref{tbl:perfnnwlsb} gives the computational time required to produce the unconstrained WLS$_{s}$ reconciled forecasts, which is always the fastest, because the weight matrix has an analytical expression and needs to be computed only once for all forecast horizons. The bold entries identify the non-negative algorithms with the best computational performances.
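For reference, the unconstrained WLS$_{s}$ forecasts used as the benchmark, and as the second initial solution, can be computed in a few lines of R. The sketch below assumes the usual representation in which the bottom-level solution is $(\bm{S}'\bm{\Lambda}^{-1}\bm{S})^{-1}\bm{S}'\bm{\Lambda}^{-1}\hat{\bm{y}}$, with $\bm{S}$ the summing matrix, $\hat{\bm{y}}$ the vector of base forecasts, and $\bm{\Lambda}$ the diagonal matrix of structural weights (the row sums of $\bm{S}$); the function name is ours.
\begin{verbatim}
# Unconstrained WLS_s reconciliation (a sketch): the bottom-level
# solution is b = (S' W S)^{-1} S' W yhat, with structural weights
# W = diag(1 / rowSums(S)).
wls_s_bottom <- function(yhat, S) {
  W <- diag(1 / rowSums(S))
  solve(t(S) %*% W %*% S, t(S) %*% W %*% yhat)
}

# SGP and PCG require a feasible start: project onto the orthant.
b0 <- pmax(wls_s_bottom(yhat, S), 0)
# BPV can start from the unconstrained solution directly.
\end{verbatim}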
The main conclusion to be drawn from these results is that the BPV algorithm always has the best computational performance. This is due in part to the alternative analytical representation proposed for MinT \citep{Wick2018}. It should also be noted that the computational performance of the BPV algorithm depends strongly on how often the full exchange rule fails, so that Murty's method, the back-up rule, has to be activated. The back-up rule was never activated in any of the experiments carried out in this section; the same was observed in the tests performed by \citet{Kim2011}. However, there is no theoretical justification for this, nor do we have conditions under which the back-up rule is guaranteed to remain inactive. Hence, the performance of the algorithm will degrade slightly if it is activated in a particular application.
The second-best timing is achieved by the PCG algorithm. However, it is inefficient for larger hierarchies, as locating the Cauchy point can be time-consuming. Of the two initial solutions considered, the (projected) unconstrained WLS$_s$ forecasts are the better choice, as expected.
We also evaluated the performance of these algorithms for obtaining non-negative reconciled forecasts from the OLS approach. The BPV and PCG algorithms showed performances similar to those observed with WLS$_s$. The SGP algorithm performed the worst; this is not surprising, as it reduces to the standard gradient projection algorithm under OLS.
To further examine the relative performance of these competing algorithms in real applications, an extensive case study is provided in Section~\ref{sec:AUSNN}.