From 05e7775f744858e838474d1cd248ffd0bb277d5c Mon Sep 17 00:00:00 2001
From: klaus spanderen <klaus@spanderen.de>
Date: Wed, 1 Nov 2023 15:43:59 +0100
Subject: [PATCH 01/12] enable papi 6.0 or higher

---
 CMakeLists.txt                   | 2 +-
 test-suite/quantlibbenchmark.cpp | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6dde66d4710..86cd7c62d98 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,7 +70,7 @@ option(QL_USE_STD_OPTIONAL "Use std::optional instead of boost::optional" OFF)
 option(QL_USE_STD_SHARED_PTR "Use standard smart pointers instead of Boost ones" OFF)
 option(QL_USE_STD_TUPLE "Use std::tuple instead of boost::tuple" ON)
 set(QL_EXTERNAL_SUBDIRECTORIES "" CACHE STRING "Optional list of external source directories to be added to the build (semicolon-separated)")
-set(QL_EXTRA_LINK_LIBRARIES "" CACHE STRING "Optional extra link libraries to add to QuantLib")
+set(QL_EXTRA_LINK_LIBRARIES "-lpapi" CACHE STRING "Optional extra link libraries to add to QuantLib")
 
 # Require C++14 or higher
 if (NOT DEFINED CMAKE_CXX_STANDARD)
diff --git a/test-suite/quantlibbenchmark.cpp b/test-suite/quantlibbenchmark.cpp
index b848f7f2f33..ada54d7cb32 100644
--- a/test-suite/quantlibbenchmark.cpp
+++ b/test-suite/quantlibbenchmark.cpp
@@ -40,7 +40,7 @@
  Example results can be found at https://openbenchmarking.org/test/pts/quantlib
 
  This benchmark is derived from quantlibtestsuite.cpp. Please see the
-  copyrights therein.
+ copyrights therein.
 */
 
 #include <ql/types.hpp>
@@ -74,7 +74,6 @@
 //#include <papi.h>
 
 
-
 /* Use BOOST_MSVC instead of _MSC_VER since some other vendors (Metrowerks,
    for example) also #define _MSC_VER
 */

From 5ce41a18d0d96670d8bb4a6c4ba73e5d86676250 Mon Sep 17 00:00:00 2001
From: klaus spanderen <klaus@spanderen.de>
Date: Thu, 2 Nov 2023 00:47:38 +0100
Subject: [PATCH 02/12] proposal for new set of benchmarks

---
 test-suite/CMakeLists.txt        |  51 +++----
 test-suite/quantlibbenchmark.cpp | 220 ++++++++++---------------------
 2 files changed, 102 insertions(+), 169 deletions(-)

diff --git a/test-suite/CMakeLists.txt b/test-suite/CMakeLists.txt
index 0319b497b9f..5f323526e53 100644
--- a/test-suite/CMakeLists.txt
+++ b/test-suite/CMakeLists.txt
@@ -185,30 +185,37 @@ set(QL_TEST_HEADERS
 
 set(QL_BENCHMARK_SOURCES
     quantlibbenchmark.cpp
-
-    americanoption.cpp
-    asianoptions.cpp
-    barrieroption.cpp
-    basketoption.cpp
-    batesmodel.cpp
-    convertiblebonds.cpp
-    digitaloption.cpp
-    dividendoption.cpp
-    europeanoption.cpp
-    fdheston.cpp
-    hestonmodel.cpp
-    interpolations.cpp
-    jumpdiffusion.cpp
-    lowdiscrepancysequences.cpp
-    marketmodel_cms.cpp
-    marketmodel_smm.cpp
+    americanoption.cpp                  
+    andreasenhugevolatilityinterpl.cpp  
+    batesmodel.cpp                      
+    bermudanswaption.cpp                
+    cdo.cpp                             
+    cmsspread.cpp                       
+    convertiblebonds.cpp                
+    creditdefaultswap.cpp               
+    europeanoption.cpp                  
+    fdheston.cpp                        
+    fdmlinearop.cpp                     
+    hestonmodel.cpp                     
+    hestonslvmodel.cpp                  
+    linearleastsquaresregression.cpp   
+    lowdiscrepancysequences.cpp         
+    marketmodel_cms.cpp                 
+    marketmodel_smm.cpp                 
+    markovfunctional.cpp                
+    mclongstaffschwartzengine.cpp       
+    overnightindexedswap.cpp            
+    piecewiseyieldcurve.cpp             
+    riskstats.cpp                       
+    shortratemodels.cpp                 
+    swaptionvolatilitycube.cpp          
+    swingoption.cpp                     
     preconditions.cpp                   preconditions.hpp
-    quantooption.cpp
     quantlibglobalfixture.cpp			quantlibglobalfixture.hpp
-    riskstats.cpp
-    shortratemodels.cpp
     utilities.cpp                       utilities.hpp
-                                        swaptionvolstructuresutilities.hpp
+    variancegamma.cpp                   
+    vpp.cpp                            
+    zabr.cpp                           
 )
 
 if (QL_BUILD_TEST_SUITE)
@@ -228,7 +235,7 @@ if (QL_BUILD_TEST_SUITE)
     add_test(NAME quantlib_test_suite COMMAND ql_test_suite --log_level=message)
 endif()
 
-IF (QL_BUILD_BENCHMARK)
+if (QL_BUILD_BENCHMARK)
     add_executable(ql_benchmark ${QL_BENCHMARK_SOURCES})
     set_target_properties(ql_benchmark PROPERTIES OUTPUT_NAME "quantlib-benchmark")
     set_source_files_properties(quantlibbenchmark.cpp PROPERTIES SKIP_UNITY_BUILD_INCLUSION true)
diff --git a/test-suite/quantlibbenchmark.cpp b/test-suite/quantlibbenchmark.cpp
index ada54d7cb32..550242e4c0f 100644
--- a/test-suite/quantlibbenchmark.cpp
+++ b/test-suite/quantlibbenchmark.cpp
@@ -65,15 +65,15 @@
 #include <chrono>
 #include <thread>
 
-
 /* initialize PAPI on Linux
   sudo sysctl -w kernel.perf_event_paranoid=0
   export PAPI_EVENTS="PAPI_TOT_INS,PAPI_FP_OPS,PAPI_FP_INS"
   export PAPI_REPORT=1
 */
-//#include <papi.h>
 
 
+// #include <papi.h>
+
 /* Use BOOST_MSVC instead of _MSC_VER since some other vendors (Metrowerks,
    for example) also #define _MSC_VER
 */
@@ -83,118 +83,7 @@
 
 #include "utilities.hpp"
 
-namespace QuantLibTests {
-
-    namespace AmericanOptionTests {
-        struct testFdAmericanGreeks:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace AsianOptionTests {
-        struct testMCDiscreteArithmeticAveragePrice:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace BarrierOptionTests {
-        struct testBabsiriValues:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace BasketOptionTests {
-        struct testEuroTwoValues:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-
-        struct testTavellaValues:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-
-        struct testOddSamples:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace BatesModelTests {
-        struct testDAXCalibration:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace ConvertibleBondTests {
-        struct testBond:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace DigitalOptionTests {
-        struct testMCCashAtHit:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace DividendOptionTests {
-        struct testFdEuropeanGreeks:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-
-        struct testFdAmericanGreeks:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace EuropeanOptionTests {
-        struct testMcEngines:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-
-        struct testImpliedVol:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-
-        struct testFdEngines:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace FdHestonTests {
-        struct testFdmHestonAmerican:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace HestonModelTests {
-        struct testDAXCalibration:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace InterpolationTests {
-        struct testSabrInterpolation:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace JumpDiffusionTests {
-        struct testGreeks:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace LowDiscrepancyTests {
-        struct testMersenneTwisterDiscrepancy:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace MarketModelCmsTests {
-        struct testMultiStepCmSwapsAndSwaptions:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace MarketModelSmmTests {
-        struct testMultiStepCoterminalSwapsAndSwaptions:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace QuantoOptionTests {
-        struct testForwardGreeks:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-
-    namespace RiskStatisticsTests {
-        struct testResults:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
 
-    namespace ShortRateModelTests {
-        struct testSwaps:
-            public BOOST_AUTO_TEST_CASE_FIXTURE { void test_method(); };
-    }
-}
 
 namespace {
 
@@ -224,46 +113,83 @@ namespace {
                        // point operations (not per sec!)
     };
 
-    std::vector<Benchmark> bm = {
-        Benchmark("AmericanOption::FdAmericanGreeks", [] { QuantLibTests::AmericanOptionTests::testFdAmericanGreeks().test_method(); }, 518.31),
-        Benchmark("AsianOption::MCArithmeticAveragePrice", [] { QuantLibTests::AsianOptionTests::testMCDiscreteArithmeticAveragePrice().test_method(); }, 5186.13),
-        Benchmark("BarrierOption::BabsiriValues", [] { QuantLibTests::BarrierOptionTests::testBabsiriValues().test_method(); }, 880.8),
-        Benchmark("BasketOption::EuroTwoValues", [] { QuantLibTests::BasketOptionTests::testEuroTwoValues().test_method(); }, 340.04),
-        Benchmark("BasketOption::EuroTwoValues", [] { QuantLibTests::BasketOptionTests::testTavellaValues().test_method(); }, 933.80),
-        Benchmark("BasketOption::EuroTwoValues", [] { QuantLibTests::BasketOptionTests::testOddSamples().test_method(); }, 642.46),
-        Benchmark("BatesModel::DAXCalibration", [] { QuantLibTests::BatesModelTests::testDAXCalibration().test_method(); }, 1993.35),
-        Benchmark("ConvertibleBondTest::testBond", [] { QuantLibTests::ConvertibleBondTests::testBond().test_method(); }, 159.85),
-        Benchmark("DigitalOption::MCCashAtHit", [] { QuantLibTests::DigitalOptionTests::testMCCashAtHit().test_method(); }, 995.87),
-        Benchmark("DividendOption::FdEuropeanGreeks", [] { QuantLibTests::DividendOptionTests::testFdEuropeanGreeks().test_method(); }, 949.52),
-        Benchmark("DividendOption::FdAmericanGreeks", [] { QuantLibTests::DividendOptionTests::testFdAmericanGreeks().test_method(); }, 1113.74),
-        Benchmark("EuropeanOption::FdMcEngines", [] { QuantLibTests::EuropeanOptionTests::testMcEngines().test_method(); }, 1988.63),
-        Benchmark("EuropeanOption::ImpliedVol", [] { QuantLibTests::EuropeanOptionTests::testImpliedVol().test_method(); }, 131.51),
-        Benchmark("EuropeanOption::FdEngines", [] { QuantLibTests::EuropeanOptionTests::testFdEngines().test_method(); }, 148.43),
-        Benchmark("FdHestonTest::testFdmHestonAmerican", [] { QuantLibTests::FdHestonTests::testFdmHestonAmerican().test_method(); }, 234.21),
-        Benchmark("HestonModel::DAXCalibration", [] { QuantLibTests::HestonModelTests::testDAXCalibration().test_method(); }, 555.19),
-        Benchmark("InterpolationTest::testSabrInterpolation", [] { QuantLibTests::InterpolationTests::testSabrInterpolation().test_method(); }, 295.63),
-        Benchmark("JumpDiffusion::Greeks", [] { QuantLibTests::JumpDiffusionTests::testGreeks().test_method(); }, 433.77),
-        Benchmark("MarketModelCmsTest::testCmSwapsSwaptions", [] { QuantLibTests::MarketModelCmsTests::testMultiStepCmSwapsAndSwaptions().test_method(); }, 11497.73),
-        Benchmark("MarketModelSmmTest::testMultiSmmSwaptions", [] { QuantLibTests::MarketModelSmmTests::testMultiStepCoterminalSwapsAndSwaptions().test_method(); }, 11244.95),
-        Benchmark("QuantoOption::ForwardGreeks", [] { QuantLibTests::QuantoOptionTests::testForwardGreeks().test_method(); }, 90.98),
-        Benchmark("RandomNumber::MersenneTwisterDescrepancy", [] { QuantLibTests::LowDiscrepancyTests::testMersenneTwisterDiscrepancy().test_method(); }, 951.98),
-        Benchmark("RiskStatistics::Results", [] { QuantLibTests::RiskStatisticsTests::testResults().test_method(); }, 300.28),
-        Benchmark("ShortRateModel::Swaps", [] { QuantLibTests::ShortRateModelTests::testSwaps().test_method(); }, 454.73)
+    std::vector<Benchmark> bm;
+
+    // A helper class to push benchmark objects into bm before main() starts
+    // Every time the constructor is called, a test is pushed into bm
+    struct AddBenchmark {
+        AddBenchmark() {}
+        template<class CALLABLE>
+        AddBenchmark(CALLABLE && body, const char* name, double cost) {
+            bm.push_back( Benchmark(name, std::move(body), cost) );
+        }
     };
 
+}
+
+#define QL_BENCHMARK_DECLARE(test_fixture, test_name, num_iters, cost)   \
+    namespace QuantLibTests {                                        \
+    namespace test_fixture {                                         \
+        struct test_name : public BOOST_AUTO_TEST_CASE_FIXTURE {     \
+            void test_method();                                      \
+        };                                                           \
+    }}                                                               \
+    \
+    namespace {             \
+        /* Declare unique global variable and push benchmark into bm */ \
+        AddBenchmark test_fixture##_##test_name( \
+                [] { for(int i=0; i<num_iters; i++) QuantLibTests::test_fixture::test_name().test_method(); }, \
+                #test_fixture "::" #test_name,                                  \
+                cost);                                             \
+    }
+
+
+
+QL_BENCHMARK_DECLARE(BatesModelTests, testDAXCalibration, 1, 1163.36);
+QL_BENCHMARK_DECLARE(HestonModelTests, testDAXCalibration, 1, 852.86);
+QL_BENCHMARK_DECLARE(FdHestonTests, testFdmHestonAmerican, 1, 183.52);
+QL_BENCHMARK_DECLARE(AmericanOptionTests, testFdAmericanGreeks, 1, 774.82);
+QL_BENCHMARK_DECLARE(EuropeanOptionTests, testImpliedVol, 1, 91.69);
+QL_BENCHMARK_DECLARE(HestonSLVModelTests, testMonteCarloCalibration, 1, 2395.90);
+QL_BENCHMARK_DECLARE(HestonSLVModelTests, testBarrierPricingViaHestonLocalVol, 1, 734.21);
+QL_BENCHMARK_DECLARE(MCLongstaffSchwartzEngineTests, testAmericanOption, 1, 1540.91);
+QL_BENCHMARK_DECLARE(VarianceGammaTests, testVarianceGamma, 1, 69.25);
+QL_BENCHMARK_DECLARE(ConvertibleBondTests, testBond, 1, 83.19);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testArbitrageFree, 1, 672.74);
+QL_BENCHMARK_DECLARE(ShortRateModelTests, testSwaps, 1, 75.51);
+QL_BENCHMARK_DECLARE(MarketModelCmsTests, testMultiStepCmSwapsAndSwaptions, 1, 10016.22);
+QL_BENCHMARK_DECLARE(MarketModelSmmTests, testMultiStepCoterminalSwapsAndSwaptions, 1, 9332.63);
+QL_BENCHMARK_DECLARE(BermudanSwaptionTests, testCachedG2Values, 1, 2189.44);
+QL_BENCHMARK_DECLARE(PiecewiseYieldCurveTests, testConvexMonotoneForwardConsistency, 10, 229.33);
+QL_BENCHMARK_DECLARE(OvernightIndexedSwapTests, testBootstrapWithArithmeticAverage, 10, 1084.21);
+QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testCalibrationTwoInstrumentSets, 1, 1743.69);
+QL_BENCHMARK_DECLARE(ShortRateModelTests, testCachedHullWhite2, 100, 220.91);
+QL_BENCHMARK_DECLARE(SwaptionVolatilityCubeTests, testSpreadedCube, 10, 336.87);
+QL_BENCHMARK_DECLARE(ZabrTests, testConsistency, 1, 11913.76);
+QL_BENCHMARK_DECLARE(CmsSpreadTests, testCouponPricing, 1, 1184.0);
+QL_BENCHMARK_DECLARE(CreditDefaultSwapTests, testImpliedHazardRate, 1000, 227.2);
+QL_BENCHMARK_DECLARE(SwingOptionTests, testExtOUJumpSwingOption, 1, 4329.34);
+QL_BENCHMARK_DECLARE(VppTests, testVPPPricing, 1, 3994.80);
+QL_BENCHMARK_DECLARE(RiskStatisticsTests, testResults, 1, 208.13);
+QL_BENCHMARK_DECLARE(LowDiscrepancyTests, testMersenneTwisterDiscrepancy, 1, 487.65);
+QL_BENCHMARK_DECLARE(FdmLinearOpTests, testFdmMesherIntegral, 100, 4.2);
+QL_BENCHMARK_DECLARE(LinearLeastSquaresRegressionTests, testMultiDimRegression, 1, 81.78);
+
+
+namespace {
     class TimedBenchmark {
       public:
         TimedBenchmark(std::function<void(void)> f, std::string  name)
         : f_(std::move(f)), name_(std::move(name)) {}
 
         void startMeasurement() const {
-            //QL_REQUIRE(PAPI_hl_region_begin(name_.c_str()) == PAPI_OK,
-            //    "could not initialize PAPI");
+//            QL_REQUIRE(PAPI_hl_region_begin(name_.c_str()) == PAPI_OK,
+//                "could not initialize PAPI");
         }
 
         void stopMeasurement() const {
-            //QL_REQUIRE(PAPI_hl_region_end(name_.c_str()) == PAPI_OK,
-            //    "could not stop PAPI");
+//            QL_REQUIRE(PAPI_hl_region_end(name_.c_str()) == PAPI_OK,
+//                "could not stop PAPI");
         }
 
         double operator()() const {
@@ -287,9 +213,9 @@ namespace {
 
         const std::string header = "Benchmark Suite QuantLib "  QL_VERSION;
 
-        std::cout << std::endl << std::string(58,'-') << std::endl;
+        std::cout << std::endl << std::string(78,'-') << std::endl;
         std::cout << header << std::endl;
-        std::cout << std::string(58,'-') << std::endl << std::endl;
+        std::cout << std::string(78,'-') << std::endl << std::endl;
 
         std::sort(runTimes.begin(), runTimes.end(),
             [](const auto& a, const auto& b) {
@@ -317,15 +243,15 @@ namespace {
                     * nProc * std::get<1>(iterT);
 
             std::cout << std::get<0>(iterT).getName()
-                      << std::string(42-std::get<0>(iterT).getName().length(),' ')
+                      << std::string(62-std::get<0>(iterT).getName().length(),' ')
                       << ":" << std::fixed << std::setw(8) << std::setprecision(1)
                       << mflopsPerSec
                       << " mflops" << std::endl;
 
             sum+=mflopsPerSec;
         }
-        std::cout << std::string(58,'-') << std::endl
-                  << "QuantLib Benchmark Index                  :"
+        std::cout << std::string(78,'-') << std::endl
+                  << "QuantLib Benchmark Index" << std::string(38,' ') << ":"
                   << std::fixed << std::setw(8) << std::setprecision(1)
                   << sum/aggTimes.size()
                   << " mflops" << std::endl;

From 9581d6a44cefef909d1df8280b49ef059d21afe9 Mon Sep 17 00:00:00 2001
From: klaus spanderen <klaus@spanderen.de>
Date: Thu, 2 Nov 2023 01:16:51 +0100
Subject: [PATCH 03/12] remove papi linker

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86cd7c62d98..1f96676aa0d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,7 +70,8 @@ option(QL_USE_STD_OPTIONAL "Use std::optional instead of boost::optional" OFF)
 option(QL_USE_STD_SHARED_PTR "Use standard smart pointers instead of Boost ones" OFF)
 option(QL_USE_STD_TUPLE "Use std::tuple instead of boost::tuple" ON)
 set(QL_EXTERNAL_SUBDIRECTORIES "" CACHE STRING "Optional list of external source directories to be added to the build (semicolon-separated)")
-set(QL_EXTRA_LINK_LIBRARIES "-lpapi" CACHE STRING "Optional extra link libraries to add to QuantLib")
+# set -lpapi here
+set(QL_EXTRA_LINK_LIBRARIES "" CACHE STRING "Optional extra link libraries to add to QuantLib")
 
 # Require C++14 or higher
 if (NOT DEFINED CMAKE_CXX_STANDARD)

From 12e62b432cdddcd913640d5dc25f88f9c7a60c17 Mon Sep 17 00:00:00 2001
From: klaus spanderen <klaus@spanderen.de>
Date: Thu, 2 Nov 2023 18:41:54 +0100
Subject: [PATCH 04/12] adjust configure

---
 test-suite/Makefile.am           | 27 ++++++++++++++++++---------
 test-suite/quantlibbenchmark.cpp |  2 +-
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/test-suite/Makefile.am b/test-suite/Makefile.am
index eebc8c21b13..c154cc4b93b 100644
--- a/test-suite/Makefile.am
+++ b/test-suite/Makefile.am
@@ -186,28 +186,37 @@ QL_TESTS = ${QL_TEST_SRCS} ${QL_TEST_HDRS}
 QL_BENCHMARK_SRCS = \
 	quantlibbenchmark.cpp \
 	americanoption.cpp \
-	asianoptions.cpp \
-	barrieroption.cpp \
-	doublebarrieroption.cpp \
-	basketoption.cpp \
+	andreasenhugevolatilityinterpl.cpp \
 	batesmodel.cpp \
+	bermudanswaption.cpp \
+	cdo.cpp \
+	cmsspread.cpp \
 	convertiblebonds.cpp \
-	digitaloption.cpp \
-	dividendoption.cpp \
+	creditdefaultswap.cpp \
 	europeanoption.cpp \
 	fdheston.cpp \
+	fdmlinearop.cpp \
 	hestonmodel.cpp \
-	interpolations.cpp \
-	jumpdiffusion.cpp \
+	hestonslvmodel.cpp \
+	linearleastsquaresregression.cpp \
 	lowdiscrepancysequences.cpp \
 	marketmodel_cms.cpp \
 	marketmodel_smm.cpp \
 	preconditions.cpp \
 	quantlibglobalfixture.cpp \
 	quantooption.cpp \
+	markovfunctional.cpp \
+	mclongstaffschwartzengine.cpp \
+	overnightindexedswap.cpp \
+	piecewiseyieldcurve.cpp \
 	riskstats.cpp \
 	shortratemodels.cpp \
-	utilities.cpp
+	swaptionvolatilitycube.cpp \
+	swingoption.cpp \
+	utilities.cpp \
+	variancegamma.cpp \
+	vpp.cpp \
+	zabr.cpp
 
 QL_BENCHMARK_HDRS = \
 	quantlibglobalfixture.hpp \
diff --git a/test-suite/quantlibbenchmark.cpp b/test-suite/quantlibbenchmark.cpp
index 550242e4c0f..aa31fe7bf1e 100644
--- a/test-suite/quantlibbenchmark.cpp
+++ b/test-suite/quantlibbenchmark.cpp
@@ -344,7 +344,7 @@ int main(int argc, char* argv[] ) {
                 nProc*bm.size(), sizeof(unsigned)
             );
             message_queue rq(
-                open_or_create, testResultQueueName, 16, sizeof(result_type));
+                open_or_create, testResultQueueName, std::min(16u, nProc), sizeof(result_type));
 
             const std::vector<std::string> workerArgs(1, clientModeStr);
             std::vector<std::thread> threadGroup;

From 0259c40fb632513983cd631189ce8168cba7e3e7 Mon Sep 17 00:00:00 2001
From: klaus spanderen <klaus@spanderen.de>
Date: Thu, 2 Nov 2023 21:22:28 +0100
Subject: [PATCH 05/12] added size parameter to benchmark suite

---
 test-suite/quantlibbenchmark.cpp | 48 +++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/test-suite/quantlibbenchmark.cpp b/test-suite/quantlibbenchmark.cpp
index aa31fe7bf1e..c000d98bb89 100644
--- a/test-suite/quantlibbenchmark.cpp
+++ b/test-suite/quantlibbenchmark.cpp
@@ -208,9 +208,12 @@ namespace {
     };
 
     void printResults(
-        unsigned nProc,
+        unsigned nProc, unsigned nSize,
         std::vector<std::pair<Benchmark, double> >& runTimes) {
 
+        QL_REQUIRE(runTimes.size() == nProc*nSize*bm.size(),
+            "inconsistent number of results");
+
         const std::string header = "Benchmark Suite QuantLib "  QL_VERSION;
 
         std::cout << std::endl << std::string(78,'-') << std::endl;
@@ -268,6 +271,7 @@ int main(int argc, char* argv[] ) {
     bool clientMode = false;
 
     unsigned nProc = 1;
+    unsigned nSize = 1;
     std::vector<std::pair<Benchmark, double> > runTimes;
 
     for (int i=1; i<argc; ++i) {
@@ -280,6 +284,20 @@ int main(int argc, char* argv[] ) {
                 ? boost::numeric_cast<unsigned>(std::stoul(tok[1]))
                 : std::thread::hardware_concurrency();
         }
+        else if (tok[0] == "--size") {
+            QL_REQUIRE(tok.size() == 2,
+                "benchmark size is not given. Should be one out of S, M, L or XL, default is S");
+            if (tok[1] == "S")
+                nSize = 1;
+            else if (tok[1] == "M")
+                nSize = 3;
+            else if (tok[1] == "L")
+                nSize = 5;
+            else if (tok[1] == "XL")
+                nSize = 20;
+            else
+                QL_FAIL("uknown benchmark size, Should be one out of S, M, L or XL");
+        }
         else if (arg == "--help" || arg == "-?") {
             std::cout
                 << "'quantlib-benchmark' is QuantLib " QL_VERSION " CPU performance benchmark"
@@ -292,6 +310,8 @@ int main(int argc, char* argv[] ) {
                 << "--mp[=PROCESSES] \t parallel execution with PROCESSES processes"
                 << std::endl
 #endif
+                << "--size=S|M|L|XL \t size of the benchmark"
+                << std::endl
                 << "-?, --help \t\t display this help and exit"
                 << std::endl;
             return 0;
@@ -309,12 +329,13 @@ int main(int argc, char* argv[] ) {
     }
 
     if (nProc == 1 && !clientMode) {
-        std::for_each(bm.begin(), bm.end(),
-            [&runTimes](const Benchmark& iter) {
-                runTimes.emplace_back(
-                    iter, TimedBenchmark(iter.getTestCase(), iter.getName())());
-        });
-        printResults(nProc, runTimes);
+        for (unsigned i=0; i < nSize; ++i)
+            std::for_each(bm.begin(), bm.end(),
+                [&runTimes](const Benchmark& iter) {
+                    runTimes.emplace_back(
+                        iter, TimedBenchmark(iter.getTestCase(), iter.getName())());
+            });
+        printResults(nProc, nSize, runTimes);
     }
     else {
 #ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER
@@ -341,10 +362,10 @@ int main(int argc, char* argv[] ) {
 
             message_queue mq(
                 open_or_create, testUnitIdQueueName,
-                nProc*bm.size(), sizeof(unsigned)
+                nProc*nSize*bm.size(), sizeof(unsigned)
             );
             message_queue rq(
-                open_or_create, testResultQueueName, std::min(16u, nProc), sizeof(result_type));
+                open_or_create, testResultQueueName, std::max(16u, nProc), sizeof(result_type));
 
             const std::vector<std::string> workerArgs(1, clientModeStr);
             std::vector<std::thread> threadGroup;
@@ -352,12 +373,13 @@ int main(int argc, char* argv[] ) {
                 threadGroup.emplace_back([&]() { worker(argv[0], workerArgs); });
             }
 
-            for (unsigned i=0; i < nProc; ++i)
-                for (unsigned j=0; j < bm.size(); ++j)
+            for (unsigned i=0; i < nProc*nSize; ++i)
+                for (unsigned j=0; j < bm.size(); ++j) {
                     mq.send(&j, sizeof(unsigned), 0);
+                }
 
             result_type r;
-            for (unsigned i = 0; i < nProc*bm.size(); ++i) {
+            for (unsigned i = 0; i < nProc*nSize*bm.size(); ++i) {
                 rq.receive(&r, sizeof(result_type), recvd_size, priority);
                 runTimes.push_back(std::make_pair(bm[r.first], r.second));
             }
@@ -367,7 +389,7 @@ int main(int argc, char* argv[] ) {
             for (auto& thread: threadGroup) {
                 thread.join();
             }
-            printResults(nProc, runTimes);
+            printResults(nProc, nSize, runTimes);
         }
         else {
             message_queue mq(open_only, testUnitIdQueueName);

From cdfa72d32be184218bb79edab1486bbc1a0c37eb Mon Sep 17 00:00:00 2001
From: klaus spanderen <klaus@spanderen.de>
Date: Thu, 2 Nov 2023 21:22:58 +0100
Subject: [PATCH 06/12] .

---
 test-suite/quantlibbenchmark.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test-suite/quantlibbenchmark.cpp b/test-suite/quantlibbenchmark.cpp
index c000d98bb89..87919bb1146 100644
--- a/test-suite/quantlibbenchmark.cpp
+++ b/test-suite/quantlibbenchmark.cpp
@@ -296,7 +296,7 @@ int main(int argc, char* argv[] ) {
             else if (tok[1] == "XL")
                 nSize = 20;
             else
-                QL_FAIL("uknown benchmark size, Should be one out of S, M, L or XL");
+                QL_FAIL("unknown benchmark size, Should be one out of S, M, L or XL");
         }
         else if (arg == "--help" || arg == "-?") {
             std::cout

From 600a0e0eb53dd578330efb6eab5041cf337662d4 Mon Sep 17 00:00:00 2001
From: Jacques du Toit <jadutoit@amd.com>
Date: Fri, 5 Jan 2024 09:39:42 +0000
Subject: [PATCH 07/12] Overhaul QuantLib benchmark

Many thanks to Klaus Spanderen for help and guidance on
this work.

QuantLib is one of the few open source production quality quant libraries.
As such, platforms such as Phoronix and others have used the QuantLib
benchmark as a measure of how well different CPUs run "financial workloads".
This is only meaningful inasmuch as the QL benchmark resembles an overnight
risk run (this is the main computational workload that investment banks
select hardware for).

The original QL benchmark was sequential, which clearly is unrealistic.
In addition, it was rather light on more modern 2 and 3 factor models,
most of the tests it ran were rather short, it seemed not to have so much
in the line of Barriers or products using AMC, and it computed a score
based on FLOP counts.

For the problems with using FLOPS as the only system performance metric,
please see the HPCG benchmark and discussion on their website.

My experience is that wall time of a large risk run is a metric that many
investment banks can get behind when comparing hardware.  This suggests
that the benchmark should in some way resemble a large risk run.  We
can at best approximate this rather loosely, since the makeup of a risk run
is highly dependent on the products, models, numerical techniques, etc, that
an organisation has.  We've chosen to focus on the following main features:
 * Work is expressed as tasks
 * A large fixed-size hopper of tasks is given to the machine. The number of
   tasks is independent of the machine being tested
 * The tasks are not the same: they do different things, have different runtimes,
   some are very short, some are long, they run different numerical techniques, etc
 * Tasks are all single threaded and all independent
 * The metric of performance is how quickly the system can complete all the tasks

This way, as long as we run the same number of tasks, the performance of machines
can be compared.

There is a potential tail effect here - the larger the number of tasks, the smaller
the tail effect. There is instrumentation to calculate the tail effect; in my testing,
for benchmark sizes S and up, the effect is small (less than 3%). I schedule the
longest-running tasks first (the approx. relative runtime of each task is hard-coded
into the benchmark).

The selection of work in each task is somewhat arbitrary.  We selected a broad range
of tests from the test suite, trying to cover as many different parts of the library
as we could. There is no right way to do this really.

It is also important to check that we get the right answer.  There is no point getting
the wrong answer faster.  Vendor compilers can make quite aggressive optimisations,
for example the Intel Compiler 2023 enables fast-math by default at O3, which causes
some tests to fail.

The boost test framework adds another layer of complexity.  If one "hacks" entry points
to the test functions by declaring their symbol names and calling them directly,
then all the checking logic inside boost is disabled: no exceptions are raised for
failures, and it's impossible to know whether the test passed or failed.
Conversely, if one calls the boost test framework and asks it to
execute the tests (which enables all the checks and exceptions) then a substantial overhead
is introduced.

We worked around this by running each test exactly once through the boost framework, trapping
exceptions and terminating as needed.  All other executions of a test/task called the
symbols directly, bypassing the boost test framework.  This seems to me to give
a reasonable compromise.  Note that there is no way to reliably run
BOOST_AUTO_TEST_CASE_TEMPLATE tests without the boost framework, so these tests
are out of scope for the benchmark.

We also included extensive logging information, which was very useful in debugging performance
problems.

Lastly, it's important that people on small machines can run the benchmark with relative ease,
while on large machines the system is kept busy for around 2min or more.  We've approached this
through the --size parameter.  The recommended benchmark size for large machines (e.g.
dual socket servers with ~100 cores per socket) is S.  Benchmark sizes of M, L and above are
reserved for future growth.  Smaller systems should use benchmark sizes of XS, XXS or XXXS.

It is crucial, once this patch is merged, that these benchmark sizes are NOT CHANGED since that
will make it impossible to compare machines.  Machines can only be compared if they ran the
same workload, i.e. they had the same number of tasks.  We can introduce more T-shirt sizes
on the front or end of the list, but the T-shirt sizes in the benchmark must remain fixed.
---
 test-suite/CMakeLists.txt        |    8 +
 test-suite/quantlibbenchmark.cpp | 1022 ++++++++++++++++++++++--------
 2 files changed, 761 insertions(+), 269 deletions(-)

diff --git a/test-suite/CMakeLists.txt b/test-suite/CMakeLists.txt
index 5f323526e53..6f453fe04e3 100644
--- a/test-suite/CMakeLists.txt
+++ b/test-suite/CMakeLists.txt
@@ -190,14 +190,19 @@ set(QL_BENCHMARK_SOURCES
     batesmodel.cpp                      
     bermudanswaption.cpp                
     cdo.cpp                             
+    cms.cpp          
     cmsspread.cpp                       
     convertiblebonds.cpp                
     creditdefaultswap.cpp               
     europeanoption.cpp                  
     fdheston.cpp                        
     fdmlinearop.cpp                     
+    functions.cpp     
+    gaussianquadratures.cpp   
     hestonmodel.cpp                     
     hestonslvmodel.cpp                  
+    interestrates.cpp        
+    libormarketmodel.cpp  
     linearleastsquaresregression.cpp   
     lowdiscrepancysequences.cpp         
     marketmodel_cms.cpp                 
@@ -207,7 +212,10 @@ set(QL_BENCHMARK_SOURCES
     overnightindexedswap.cpp            
     piecewiseyieldcurve.cpp             
     riskstats.cpp                       
+    rounding.cpp                        
     shortratemodels.cpp                 
+    stats.cpp    
+    squarerootclvmodel.cpp              
     swaptionvolatilitycube.cpp          
     swingoption.cpp                     
     preconditions.cpp                   preconditions.hpp
diff --git a/test-suite/quantlibbenchmark.cpp b/test-suite/quantlibbenchmark.cpp
index 87919bb1146..5369cd0a8c2 100644
--- a/test-suite/quantlibbenchmark.cpp
+++ b/test-suite/quantlibbenchmark.cpp
@@ -22,22 +22,16 @@
  QuantLib Benchmark Suite
 
  Measures the performance of a preselected set of numerically intensive
- test cases. The overall QuantLib Benchmark Index is given by the average
- performance in mflops. This benchmarks supports multiprocessing, e.g.
+ test cases. This benchmark supports multiprocessing, e.g.
 
- Single process benchmark:
- ./quantlib-benchmark
+ Single process benchmark for testing:
+ ./quantlib-benchmark --size=1 --nProc=1 
 
- Benchmark with 16 processes:
- ./quantlib-benchmark --mp=16
+ Benchmark with 16 processes and the default size:
+ ./quantlib-benchmark --nProc=16
 
- Benchmark with one process per core
- ./quantlib-benchmark --mp
-
- The number of floating point operations of a given test case was measured
- using PAPI, http://icl.cs.utk.edu/papi
-
- Example results can be found at https://openbenchmarking.org/test/pts/quantlib
+ Benchmark with one worker process per hardware thread and the default size:
+ ./quantlib-benchmark 
 
  This benchmark is derived from quantlibtestsuite.cpp. Please see the
  copyrights therein.
@@ -51,11 +45,14 @@
 #include <boost/interprocess/ipc/message_queue.hpp>
 #endif
 
-#define BOOST_TEST_NO_MAIN 1
+#define BOOST_TEST_NO_MAIN 
+#define BOOST_TEST_ALTERNATIVE_INIT_API 
 #include <boost/test/included/unit_test.hpp>
 
 #include <boost/algorithm/string.hpp>
 #include <boost/numeric/conversion/cast.hpp>
+#include <boost/test/unit_test_suite.hpp>
+#include <boost/test/framework.hpp>
 
 #include <iomanip>
 #include <iostream>
@@ -65,15 +62,8 @@
 #include <chrono>
 #include <thread>
 
-/* initialize PAPI on Linux
-  sudo sysctl -w kernel.perf_event_paranoid=0
-  export PAPI_EVENTS="PAPI_TOT_INS,PAPI_FP_OPS,PAPI_FP_INS"
-  export PAPI_REPORT=1
-*/
 
 
-// #include <papi.h>
-
 /* Use BOOST_MSVC instead of _MSC_VER since some other vendors (Metrowerks,
    for example) also #define _MSC_VER
 */
@@ -85,234 +75,601 @@
 
 
 
-namespace {
 
-    class Benchmark {
-      public:
-        Benchmark(std::string name, std::function<void(void)> f, double mflop)
-        : f_(std::move(f)), name_(std::move(name)), mflop_(mflop) {}
+namespace {
 
-        std::function<void(void)> getTestCase() const {
-            return f_;
-        }
-        double getMflop() const {
-            return mflop_;
-        }
-        std::string getName() const {
-            return name_;
-        }
-        void swap(Benchmark& other) {
-            std::swap(f_, other.f_);
-            std::swap(name_, other.name_);
-            std::swap(mflop_, other.mflop_);
-        }
-      private:
-        std::function<void(void)> f_;
-        std::string name_;
-        double mflop_; // total number of mega floating
-                       // point operations (not per sec!)
-    };
+    /**
+     * A class representing an individual benchmark.  Each benchmark is one of the QuantLib
+     * test-suite tests, run one or more times.  Boost unit test framework causes a dilemma:
+     *
+     *    * if we don't use boost::unit_test::framework::run to run the test, then all the 
+     *       correctness checks are disabled.  We can't validate that the test passed.
+     *    * if we do use boost::unit_test::framework::run, then we incur a very large overhead
+     *       especially for short tests that are run many thousands of times.
+     *
+     * We deal with this by running each test exactly once using boost::unit_test::framework::run.
+     * Failures are marked using a boost::unit_test::test_observer and cause immediate tear down
+     * of the benchmark master process.  All subsequent runs of the test are done through a hack.
+     * We copy the declarations of the BOOST_AUTO_TEST_CASE and friends macros in boost/test/unit_test_suite.hpp
+     * to declare the symbols that Boost creates.  This allows us to call these symbols directly, 
+     * by-passing the boost unit test framework completely.
+     *
+     * The overall benchmark is parallelised using Boost::IPC.  QuantLib is not thread safe, so any
+     * kind of shared memory parallelism is ruled out.  The benchmark creates a large (fixed) amount of
+     * work, distributes this between all the workers, and sees how quickly the workers can finish it all.
+     * The overall metric is #tasks/s that the system can process.  The tasks are pre-set (these are the
+     * tests from the test-suite), and the --size argument to the benchmark controls how many times the
+     * entire set of tasks is executed. Once the machine is saturated with work the benchmark typically 
+     * exhibits perfect weak scaling: doubling --size will double runtime and leave #tasks/s unchanged.
+     * The #tasks/s will typically increase as the machine is given more work to do.
+     *
+     * The pre-set benchmark sizes are chosen to saturate even very large machines.
+     */
+    class Benchmark 
+    {
+        public:
+            template<class CALLABLE>
+                Benchmark(
+                        std::string name,               // the test name, as known by boost::unit_test::test_unit
+                        CALLABLE &&body,                // the "body" of the test we want to run
+                        double cost                     // how expensive (runtime) this test is relative to others
+                        )
+                : name_(std::move(name)), test_(nullptr), cost_(cost), totalRuntime_(0), testBody_(std::forward<CALLABLE>(body)) {}
+
+            Benchmark(const Benchmark& copy) = default;        
+            Benchmark(Benchmark&& move) = default;        
+            Benchmark& operator=(const Benchmark &other) = default;
+            Benchmark& operator=(Benchmark &&other) = default;
+
+            double getCost() const          { return cost_; }
+            std::string getName() const     { return name_; }
+            bool foundTestUnit() const      { return test_ != nullptr; }
+            // Total runtime across multiple runs is manually accumulated into the class
+            double& getTotalRuntime()       { return totalRuntime_; }
+            const double& getTotalRuntime() const { return totalRuntime_; }
+            void setTestUnit(const boost::unit_test::test_unit * unit) { test_ = unit; }
+
+
+            // Run the underlying QuantLib test exactly once using the Boost test framework
+            // This will check all results and will flag any errors that are found.  It is much
+            // slower than running just the test body outside of the Boost framework
+            double runValidation() const 
+            {                      
+                double time = -1.0;
+                try {
+                    auto startTime = std::chrono::steady_clock::now();  
+                    boost::unit_test::framework::run(test_, false);
+                    auto stopTime = std::chrono::steady_clock::now();
+                    time = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
+                } 
+                catch(const std::exception &e) {
+                    std::cerr << "error: caught exception in benchmark " << getName() << "\n"
+                        << "message: " << e.what() << "\n" << std::endl;                
+                }
+                catch(...) {
+                    std::cerr << "error: caught unknown exception in benchmark " << getName() << std::endl;                
+                }
+                return time;
+            }
 
-    std::vector<Benchmark> bm;
+            // Directly run the body of the underlying QuantLib test (multiple times) without using the Boost
+            // test framework. This eliminates all the boost overhead, but also disables all results checking.
+            double runBenchmark() const 
+            {                      
+                double time = -1.0;
+                try {
+                    auto startTime = std::chrono::steady_clock::now();  
+                    testBody_();
+                    auto stopTime = std::chrono::steady_clock::now();
+                    time = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
+                } 
+                catch(const std::exception &e) {
+                    std::cerr << "Error: caught exception in benchmark " << getName() << "\n"
+                        << "Message: " << e.what() << "\n" << std::endl;                
+                }
+                catch(...) {
+                    std::cerr << "Error: caught unknown exception in benchmark " << getName() << std::endl;                
+                }
+                return time;
+            }
 
-    // A helper class to push benchmark objects into bm before main() starts
-    // Every time the constructor is called, a test is pushed into bm
-    struct AddBenchmark {
-        AddBenchmark() {}
-        template<class CALLABLE>
-        AddBenchmark(CALLABLE && body, const char* name, double cost) {
-            bm.push_back( Benchmark(name, std::move(body), cost) );
-        }
+        private:
+            std::string name_;
+            const boost::unit_test::test_unit * test_;
+            double cost_; 
+            double totalRuntime_;
+            std::function<void(void)> testBody_;
     };
 
-}
 
-#define QL_BENCHMARK_DECLARE(test_fixture, test_name, num_iters, cost)   \
-    namespace QuantLibTests {                                        \
-    namespace test_fixture {                                         \
-        struct test_name : public BOOST_AUTO_TEST_CASE_FIXTURE {     \
-            void test_method();                                      \
-        };                                                           \
-    }}                                                               \
-    \
-    namespace {             \
-        /* Declare unique global variable and push benchmark into bm */ \
-        AddBenchmark test_fixture##_##test_name( \
-                [] { for(int i=0; i<num_iters; i++) QuantLibTests::test_fixture::test_name().test_method(); }, \
-                #test_fixture "::" #test_name,                                  \
-                cost);                                             \
-    }
+    /**
+     * To determine programmatically whether a test has passed or not, Boost unit test framework requires
+     * us to register a test observer class. This only gives the pass/fail status for the most recently
+     * run test, not even the name of the test that was run.  Hence we need some additional 
+     * plumbing to ensure that intra-test failures are not overridden by intra-test passes
+     * (for a test that has multiple calls to BOOST_CHECK or BOOST_FAIL). 
+     */
+    struct BenchmarkResult : public boost::unit_test::test_observer
+    {
+        public: 
+            BenchmarkResult() : passed_(true) {
+                boost::unit_test::framework::register_observer(*this);
+            }
+            ~BenchmarkResult() {
+                boost::unit_test::framework::deregister_observer(*this);
+            }
+            BenchmarkResult(const BenchmarkResult&) = delete;
+            BenchmarkResult(BenchmarkResult&&) = delete;
+            BenchmarkResult& operator=(const BenchmarkResult &) = delete;
+            BenchmarkResult& operator=(BenchmarkResult &&) = delete;
 
 
+            void assertion_result( boost::unit_test::assertion_result  ar ) override 
+            {
+                passed_ = passed_ && (ar == boost::unit_test::AR_PASSED);
+            }
+            bool pass() const { return passed_; }
+            void reset() { passed_ = true; }
 
-QL_BENCHMARK_DECLARE(BatesModelTests, testDAXCalibration, 1, 1163.36);
-QL_BENCHMARK_DECLARE(HestonModelTests, testDAXCalibration, 1, 852.86);
-QL_BENCHMARK_DECLARE(FdHestonTests, testFdmHestonAmerican, 1, 183.52);
-QL_BENCHMARK_DECLARE(AmericanOptionTests, testFdAmericanGreeks, 1, 774.82);
-QL_BENCHMARK_DECLARE(EuropeanOptionTests, testImpliedVol, 1, 91.69);
-QL_BENCHMARK_DECLARE(HestonSLVModelTests, testMonteCarloCalibration, 1, 2395.90);
-QL_BENCHMARK_DECLARE(HestonSLVModelTests, testBarrierPricingViaHestonLocalVol, 1, 734.21);
-QL_BENCHMARK_DECLARE(MCLongstaffSchwartzEngineTests, testAmericanOption, 1, 1540.91);
-QL_BENCHMARK_DECLARE(VarianceGammaTests, testVarianceGamma, 1, 69.25);
-QL_BENCHMARK_DECLARE(ConvertibleBondTests, testBond, 1, 83.19);
-QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testArbitrageFree, 1, 672.74);
-QL_BENCHMARK_DECLARE(ShortRateModelTests, testSwaps, 1, 75.51);
-QL_BENCHMARK_DECLARE(MarketModelCmsTests, testMultiStepCmSwapsAndSwaptions, 1, 10016.22);
-QL_BENCHMARK_DECLARE(MarketModelSmmTests, testMultiStepCoterminalSwapsAndSwaptions, 1, 9332.63);
-QL_BENCHMARK_DECLARE(BermudanSwaptionTests, testCachedG2Values, 1, 2189.44);
-QL_BENCHMARK_DECLARE(PiecewiseYieldCurveTests, testConvexMonotoneForwardConsistency, 10, 229.33);
-QL_BENCHMARK_DECLARE(OvernightIndexedSwapTests, testBootstrapWithArithmeticAverage, 10, 1084.21);
-QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testCalibrationTwoInstrumentSets, 1, 1743.69);
-QL_BENCHMARK_DECLARE(ShortRateModelTests, testCachedHullWhite2, 100, 220.91);
-QL_BENCHMARK_DECLARE(SwaptionVolatilityCubeTests, testSpreadedCube, 10, 336.87);
-QL_BENCHMARK_DECLARE(ZabrTests, testConsistency, 1, 11913.76);
-QL_BENCHMARK_DECLARE(CmsSpreadTests, testCouponPricing, 1, 1184.0);
-QL_BENCHMARK_DECLARE(CreditDefaultSwapTests, testImpliedHazardRate, 1000, 227.2);
-QL_BENCHMARK_DECLARE(SwingOptionTests, testExtOUJumpSwingOption, 1, 4329.34);
-QL_BENCHMARK_DECLARE(VppTests, testVPPPricing, 1, 3994.80);
-QL_BENCHMARK_DECLARE(RiskStatisticsTests, testResults, 1, 208.13);
-QL_BENCHMARK_DECLARE(LowDiscrepancyTests, testMersenneTwisterDiscrepancy, 1, 487.65);
-QL_BENCHMARK_DECLARE(FdmLinearOpTests, testFdmMesherIntegral, 100, 4.2);
-QL_BENCHMARK_DECLARE(LinearLeastSquaresRegressionTests, testMultiDimRegression, 1, 81.78);
+        private:
+            bool passed_;
+    };
 
 
-namespace {
-    class TimedBenchmark {
-      public:
-        TimedBenchmark(std::function<void(void)> f, std::string  name)
-        : f_(std::move(f)), name_(std::move(name)) {}
-
-        void startMeasurement() const {
-//            QL_REQUIRE(PAPI_hl_region_begin(name_.c_str()) == PAPI_OK,
-//                "could not initialize PAPI");
-        }
+    /**
+     * This class takes a list of Benchmarks and attempts to find the corresponding 
+     * test_units in the Boost test unit tree.  
+     * */
+    class TestUnitFinder : public boost::unit_test::test_tree_visitor
+    {
+        private:
+            TestUnitFinder(std::vector<Benchmark> & bm) : bm_(bm) {}
+
+            // Utility method needed for initialising the Boost test framework
+            static bool init_unit_test_suite() { return true; }
+
+        public:
+            bool visit(const boost::unit_test::test_unit & tu) override
+            {
+                const std::string& thisTest = tu.full_name();
+                // Try to find this in the bm array.  We know every test name will start with
+                //   "QuantLibTests/"  which contains 14 characters
+                for(auto &b : bm_ ) {
+                    if( thisTest.find( b.getName(), 14) != std::string::npos ) {
+                        // We have a match
+                        b.setTestUnit( &tu );
+                    }
+                }
+                // Continue visiting
+                return true;
+            }
 
-        void stopMeasurement() const {
-//            QL_REQUIRE(PAPI_hl_region_end(name_.c_str()) == PAPI_OK,
-//                "could not stop PAPI");
-        }
 
-        double operator()() const {
-            startMeasurement();
-            auto startTime = std::chrono::steady_clock::now();
-            BOOST_CHECK(true); // to prevent no-assertion warning
-            f_();
-            auto stopTime = std::chrono::steady_clock::now();
-            stopMeasurement();
-            return std::chrono::duration_cast<std::chrono::microseconds>(
-                 stopTime - startTime).count() * 1e-6;
-        }
-      private:
-        std::function<void(void)> f_;
-        const std::string name_;
+            // Find the corresponding Boost test_unit for each Benchmark
+            // If we can't find a test_unit, throw an exception
+            static void findAllTests(char** argv, std::vector<Benchmark> &bm)
+            {
+                boost::unit_test::framework::init(TestUnitFinder::init_unit_test_suite, 1, argv);
+                boost::unit_test_framework::framework::finalize_setup_phase();
+
+                TestUnitFinder tuf(bm);
+                boost::unit_test::traverse_test_tree(boost::unit_test_framework::framework::master_test_suite(), tuf, true);
+
+                // Now check that we've found all test units
+                for(const auto &b : bm)  {
+                    if( !b.foundTestUnit() ) {
+                        std::string msg = "Unable to find the Boost test unit for Benchmark '";
+                        msg += b.getName();
+                        msg += "'";
+                        std::runtime_error err(msg);
+                        throw err;
+                    }
+                }
+            }
+
+        private:
+            std::vector<Benchmark> & bm_;
     };
 
-    void printResults(
-        unsigned nProc, unsigned nSize,
-        std::vector<std::pair<Benchmark, double> >& runTimes) {
 
-        QL_REQUIRE(runTimes.size() == nProc*nSize*bm.size(),
-            "inconsistent number of results");
+    // The container holding all the benchmarks we will run
+    std::vector<Benchmark> bm;
+
+
+
+    /**
+     * A class to group and tidy up all the benchmark IO and boilerplate routines
+     */
+    struct BenchmarkSupport
+    {
+        // Verbosity level and a logging macro to help debugging
+        static int verbose;   
+#define LOG_MESSAGE(...)  if(BenchmarkSupport::verbose >= 3) { std::cout << __VA_ARGS__ << std::endl; }
 
-        const std::string header = "Benchmark Suite QuantLib "  QL_VERSION;
 
-        std::cout << std::endl << std::string(78,'-') << std::endl;
-        std::cout << header << std::endl;
-        std::cout << std::string(78,'-') << std::endl << std::endl;
+        // The set of pre-defined benchmark sizes that we support
+        static const std::vector< std::pair<std::string, unsigned int> > bmSizes;
 
-        std::sort(runTimes.begin(), runTimes.end(),
-            [](const auto& a, const auto& b) {
-                return a.first.getName() < b.first.getName();
+        // Turn a command line '--size=<value>' string into a benchmark size
+        static unsigned int parseBmSize(const std::string &size)
+        {
+            for(const auto & p : bmSizes) {
+                if(p.first == size)
+                    return p.second;
             }
-        );
-
-        std::vector<std::tuple<Benchmark, int, double> > aggTimes;
-        for (const auto& iter: runTimes) {
-            if (aggTimes.empty()
-                    || std::get<0>(aggTimes.back()).getName()
-                        != iter.first.getName()) {
-                aggTimes.emplace_back(iter.first, 1, iter.second);
+            // OK - it's not a preset size, let's see if it's parsable as an integer
+            try {
+                unsigned int sz = std::stoul(size);
+                return sz;
+            } 
+            catch(const std::exception &e) {
+                // Unable to convert to integer.  Abort
+                std::cerr << "Error: INVALID BENCHMARK RUN\n";
+                std::cerr << "Invalid custom benchmark size specified, unable to convert to an integer\n";
+                std::cerr << "Exception generated: " << e.what() << "\n";
+                exit(1);
             }
-            else {
-                ++std::get<1>(aggTimes.back());
-                std::get<2>(aggTimes.back()) += iter.second;
+        }
+
+        // Turn a benchmark size into a string for printing
+        static std::string bmSizeAsString(unsigned int size)
+        {
+            for(const auto& p : bmSizes) {
+                if(p.second == size)
+                    return p.first;
             }
+            // Not a preset size
+            return "Custom (" + std::to_string(size) + ")";
         }
 
-        double sum=0;
-        for (const auto& iterT: aggTimes) {
-            const double mflopsPerSec
-                = std::get<0>(iterT).getMflop() / std::get<2>(iterT)
-                    * nProc * std::get<1>(iterT);
 
-            std::cout << std::get<0>(iterT).getName()
-                      << std::string(62-std::get<0>(iterT).getName().length(),' ')
-                      << ":" << std::fixed << std::setw(8) << std::setprecision(1)
-                      << mflopsPerSec
-                      << " mflops" << std::endl;
+        static void printGreeting(const std::string &size, unsigned nProc)
+        {
+            std::cout << std::endl;
+            std::cout << std::string(84,'-') << "\n";
+            std::cout << "Benchmark Suite QuantLib "  QL_VERSION << "\n";
+            std::cout << "\n";
+            std::cout << "Benchmark size='" << size << "' on " << nProc << " processes\n";
+            std::cout << std::string(84,'-') << "\n";
+            std::cout << std::endl;        
+        }
 
-            sum+=mflopsPerSec;
+        // If a test fails, notify the user and terminate the benchmark
+        static void terminateBenchmark()
+        {
+            std::cerr << "\033[0m\nError: INVALID BENCHMARK RUN.\n"
+                <<  "One or more tests failed, please see the log for details" << std::endl ;
+            // Tear down the master process, which kills all child threads/processes
+            exit(1);
         }
-        std::cout << std::string(78,'-') << std::endl
-                  << "QuantLib Benchmark Index" << std::string(38,' ') << ":"
-                  << std::fixed << std::setw(8) << std::setprecision(1)
-                  << sum/aggTimes.size()
-                  << " mflops" << std::endl;
-    }
+
+
+        static void printResults(
+                unsigned nSize,                         // the size of the benchmark
+                double masterLifetime,                  // lifetime of the master process
+                std::vector<double> workerLifetimes     // lifetimes of all the worker processes
+                ) 
+        {
+            std::cout     << "\033[0m\n";
+            std::cout     << "Benchmark Size        = " << BenchmarkSupport::bmSizeAsString(nSize) << std::endl;
+            std::cout     << "System Throughput     = " << (double(nSize) * bm.size() ) / masterLifetime << " tasks/s" << std::endl;
+            std::cout     << "Benchmark Runtime     = " << masterLifetime<< "s" << std::endl;
+
+            if(verbose >=1 ) 
+            {
+                const size_t nProc = workerLifetimes.size();
+                std::cout << "Num. Worker Processes = " << nProc << std::endl;            
+
+                // Work out tail effect.  We define "tail effect" as the ratio of the average (geomean) 
+                // tail lifetime, to the lifetime of the master process.  The cutoff for defining 
+                // the "tail" is arbitrary.  A ratio of 1 means no tail effect  (tail lifetime is same
+                // as lifetime of master process), a ratio near 0 means tail finished significantly 
+                // before master process
+                std::sort(workerLifetimes.begin(), workerLifetimes.end());        
+                const double thresh = 0.1;
+                int tail = (int)std::ceil(thresh * nProc);
+                double tailGeomean = 1.0;
+                for(int i=0; i<tail; i++) {
+                    tailGeomean *= workerLifetimes[i];
+                }
+                tailGeomean = std::pow(tailGeomean, 1.0/tail);
+                const double tailEffect = tailGeomean / masterLifetime;
+
+                std::cout << "Tail Effect Ratio     = " << tailEffect << std::endl;
+                std::cout << "                      =  Geomean( Shortest " << tail << " worker lifetimes )" << std::endl;
+                std::cout << "                      --------------------------------------------------------" << std::endl;
+                std::cout << "                                    Lifetime( Master process )" << std::endl;
+                std::cout << std::endl;
+            }
+
+            std::cout << std::string(84,'-') << std::endl;
+
+            if(verbose >= 2) {            
+                std::cout << "                       Total Runtime spent in each test " << std::endl;
+                std::cout << std::string(84,'-') << std::endl;
+
+                // Compute max test name length
+                size_t len = 0;
+                for (const auto & b : bm) { len = std::max(len, b.getName().length() ); }
+
+                for (const auto& b: bm) {
+                    std::cout << b.getName()
+                        << std::string(len+2 - b.getName().length(),' ')
+                        << ": " << b.getTotalRuntime()  << "s" << std::endl;
+                }
+                std::cout << std::string(84,'-') << std::endl;
+            }
+            std::cout << std::endl; 
+        }
+
+
 #ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER
-    int worker(const char* exe, const std::vector<std::string>& args) {
-        return boost::process::system(exe, boost::process::args=args);
-    }
+        // The entry point for the std::thread's that will be the workers
+        static int worker(const char * exe, const std::vector<std::string>& args) {        
+            return boost::process::system(exe, boost::process::args=args);
+        }
 #endif
-}
 
-int main(int argc, char* argv[] ) {
+        // A helper class to push benchmark objects into the benchmark container 
+        // before main() starts.  Every time the constructor is called, a test is added.
+        struct AddBenchmark {
+            template<class CALLABLE>
+                AddBenchmark(std::vector<Benchmark> &bm, CALLABLE && test_body, const char* name, double cost) {
+                    bm.push_back( Benchmark(name, std::move(test_body), cost) );
+                }
+        };
+    };
+    int BenchmarkSupport::verbose = 0;
+    const std::vector< std::pair<std::string, unsigned int> > BenchmarkSupport::bmSizes = {
+            {"XXS",  60},
+            {"XS",   120},
+            {"S",    240},
+            {"M",    480},
+            {"L",    960}
+        };
+
+
+    // The messages sent from workers to master across boost IPC queues
+    struct IPCResultMsg
+    {
+        unsigned bmId;              // the benchmark that was run
+        unsigned threadId;          // the ID of the worker who ran it
+        double time;                // the runtime
+    };
+
+    // The messages sent from master to workers across boost IPC queues
+    struct IPCInstructionMsg
+    {
+        unsigned j = 0;             // the benchmark to run
+        bool validate = false;      // whether to run in validation mode or not
+    };
+
+
+
+}  // END anonymous namespace
+
+
+// These are pulled from boost/test/unit_test_suite.hpp.  We declare the
+// bodies of the tests so that we can run them more efficiently.
+#define QL_BENCHMARK_DECLARE(test_fixture, test_name, num_iters, cost)   \
+    namespace QuantLibTests {                                        \
+        namespace test_fixture {                                         \
+            struct test_name : public BOOST_AUTO_TEST_CASE_FIXTURE {     \
+                void test_method();                                      \
+            };                                                           \
+        }}                                                               \
+        \
+        namespace {             \
+            /* Declare unique global variable and push benchmark into bm */ \
+            BenchmarkSupport::AddBenchmark test_fixture##_##test_name( \
+                    bm, \
+                    [] { QuantLibTests::test_fixture::test_name thetest; for(int i=0; i<num_iters; i++) thetest.test_method(); }, \
+#test_fixture "/" #test_name, cost);                                             \
+        }
+
+
+// Set of all tests we will run.  The integer is the number of times the test is run, and
+// the value at the end is a relative runtime cost of each benchmark compared with the others.
+// Exact values are not needed, we just need to know what is "expensive" and what is "cheap" 
+// in terms of runtime.
+
+// Equity & FX
+QL_BENCHMARK_DECLARE(AmericanOptionTests, testFdAmericanGreeks, 1, 0.5);
+QL_BENCHMARK_DECLARE(AmericanOptionTests, testFdValues, 20, 3.0);
+QL_BENCHMARK_DECLARE(AmericanOptionTests, testCallPutParity, 100, 1.0);
+QL_BENCHMARK_DECLARE(AmericanOptionTests, testQdEngineStandardExample, 400, 0.5);
+QL_BENCHMARK_DECLARE(EuropeanOptionTests, testImpliedVol, 1, 0.5);
+QL_BENCHMARK_DECLARE(EuropeanOptionTests, testMcEngines, 1, 1.0);
+QL_BENCHMARK_DECLARE(EuropeanOptionTests, testLocalVolatility, 3, 2.0);
+QL_BENCHMARK_DECLARE(BatesModelTests, testDAXCalibration, 1, 0.5);
+QL_BENCHMARK_DECLARE(BatesModelTests, testAnalyticVsMCPricing, 1, 1.0);
+QL_BENCHMARK_DECLARE(BatesModelTests, testAnalyticAndMcVsJumpDiffusion, 5, 1.0);
+QL_BENCHMARK_DECLARE(HestonModelTests, testDAXCalibration, 1, 0.5);
+QL_BENCHMARK_DECLARE(HestonModelTests, testFdBarrierVsCached, 1, 3.0);
+QL_BENCHMARK_DECLARE(HestonModelTests, testFdAmerican, 1, 1.0);
+QL_BENCHMARK_DECLARE(HestonModelTests, testLocalVolFromHestonModel, 10, 1.0);
+QL_BENCHMARK_DECLARE(FdHestonTests, testFdmHestonAmerican, 10, 1.0);
+QL_BENCHMARK_DECLARE(FdHestonTests, testAmericanCallPutParity, 15, 1.5);
+QL_BENCHMARK_DECLARE(FdHestonTests, testFdmHestonBarrierVsBlackScholes, 1, 2.0);
+QL_BENCHMARK_DECLARE(HestonSLVModelTests, testMonteCarloCalibration, 1, 3.0);
+QL_BENCHMARK_DECLARE(HestonSLVModelTests, testHestonFokkerPlanckFwdEquation, 1, 5.0);
+QL_BENCHMARK_DECLARE(HestonSLVModelTests, testBarrierPricingViaHestonLocalVol, 1, 1.0);
+QL_BENCHMARK_DECLARE(MCLongstaffSchwartzEngineTests, testAmericanOption, 1, 2.0);
+QL_BENCHMARK_DECLARE(VarianceGammaTests, testVarianceGamma, 1, 0.1);
+QL_BENCHMARK_DECLARE(ConvertibleBondTests, testBond, 100, 2.0);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testArbitrageFree, 1, 1.0);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testAndreasenHugeCallPut, 1, 1.0);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testAndreasenHugeCall, 1, 1.0);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testAndreasenHugePut, 1, 1.0);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testFlatVolCalibration, 1, 1.0);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testTimeDependentInterestRates, 1, 1.0);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testPiecewiseConstantInterpolation, 1, 1.0);
+QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testLinearInterpolation, 1, 1.0);
+
+// Interest Rates
+QL_BENCHMARK_DECLARE(ShortRateModelTests, testSwaps, 30, 3.0);
+QL_BENCHMARK_DECLARE(ShortRateModelTests, testCachedHullWhite2, 500, 1.0);
+QL_BENCHMARK_DECLARE(ShortRateModelTests, testCachedHullWhiteFixedReversion, 1000, 1.0);
+QL_BENCHMARK_DECLARE(MarketModelCmsTests, testMultiStepCmSwapsAndSwaptions, 1, 11.0);
+QL_BENCHMARK_DECLARE(MarketModelSmmTests, testMultiStepCoterminalSwapsAndSwaptions, 1, 9.0);
+QL_BENCHMARK_DECLARE(BermudanSwaptionTests, testCachedG2Values, 1, 2.0);
+QL_BENCHMARK_DECLARE(BermudanSwaptionTests, testCachedValues, 100, 3.0);
+QL_BENCHMARK_DECLARE(LiborMarketModelTests, testSwaptionPricing, 1, 1.0);
+QL_BENCHMARK_DECLARE(LiborMarketModelTests, testCalibration, 1, 5.0);
+QL_BENCHMARK_DECLARE(PiecewiseYieldCurveTests, testConvexMonotoneForwardConsistency, 10, 2.0);
+QL_BENCHMARK_DECLARE(PiecewiseYieldCurveTests, testFlatForwardConsistency, 50, 3.0);
+QL_BENCHMARK_DECLARE(PiecewiseYieldCurveTests, testGlobalBootstrap, 20, 2.0);
+QL_BENCHMARK_DECLARE(OvernightIndexedSwapTests, testBootstrapWithArithmeticAverage, 10, 5.0);
+QL_BENCHMARK_DECLARE(OvernightIndexedSwapTests, testBaseBootstrap, 10, 3.0);
+QL_BENCHMARK_DECLARE(OvernightIndexedSwapTests, testBootstrapRegression, 10, 1.0);
+QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testCalibrationTwoInstrumentSets, 1, 3.0);
+QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testCalibrationOneInstrumentSet, 1, 4.0);
+QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testVanillaEngines, 1, 7.0);
+QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testBermudanSwaption, 3, 1.0);
+QL_BENCHMARK_DECLARE(SwaptionVolatilityCubeTests, testSpreadedCube, 20, 1.0);
+QL_BENCHMARK_DECLARE(SwaptionVolatilityCubeTests, testSabrNormalVolatility, 1, 1.0);
+QL_BENCHMARK_DECLARE(SwaptionVolatilityCubeTests, testSabrVols, 30, 1.0);
+QL_BENCHMARK_DECLARE(ZabrTests, testConsistency, 1, 10.0);
+QL_BENCHMARK_DECLARE(CmsSpreadTests, testCouponPricing, 1, 1.0);
+QL_BENCHMARK_DECLARE(CmsTests, testCmsSwap, 20, 2.0);
+QL_BENCHMARK_DECLARE(CmsTests, testParity, 30, 2.0);
+QL_BENCHMARK_DECLARE(InterestRateTests, testConversions, 10000, 0.1);
+
+// Credit Derivatives
+QL_BENCHMARK_DECLARE(NthToDefaultTests, testGauss, 2, 14.0);
+QL_BENCHMARK_DECLARE(CreditDefaultSwapTests, testImpliedHazardRate, 1000, 1.0);
+QL_BENCHMARK_DECLARE(CreditDefaultSwapTests, testCachedMarketValue, 1000, 0.1);
+QL_BENCHMARK_DECLARE(CreditDefaultSwapTests, testIsdaEngine, 200, 2.0);
+QL_BENCHMARK_DECLARE(SquareRootCLVModelTests, testSquareRootCLVMappingFunction, 20, 0.5);
+QL_BENCHMARK_DECLARE(SquareRootCLVModelTests, testSquareRootCLVVanillaPricing, 200, 0.5);
+
+// Energy
+QL_BENCHMARK_DECLARE(SwingOptionTests, testExtOUJumpSwingOption, 1, 3.0);
+QL_BENCHMARK_DECLARE(SwingOptionTests, testExtOUJumpVanillaEngine, 1, 3.0);
+QL_BENCHMARK_DECLARE(SwingOptionTests, testFdBSSwingOption, 20, 1.0);
+QL_BENCHMARK_DECLARE(VppTests, testVPPPricing, 1, 5.0);
+QL_BENCHMARK_DECLARE(VppTests, testKlugeExtOUSpreadOption, 1, 1.0);
+
+// Math
+QL_BENCHMARK_DECLARE(RiskStatisticsTests, testResults, 4, 0.5);
+QL_BENCHMARK_DECLARE(LowDiscrepancyTests, testMersenneTwisterDiscrepancy, 2, 0.5);
+QL_BENCHMARK_DECLARE(LinearLeastSquaresRegressionTests, testMultiDimRegression, 20, 2.0);
+QL_BENCHMARK_DECLARE(StatisticsTests, testIncrementalStatistics, 20, 0.5);
+QL_BENCHMARK_DECLARE(FunctionsTests, testFactorial, 1000, 0.1);
+QL_BENCHMARK_DECLARE(FunctionsTests, testGammaFunction, 1000, 0.5);
+QL_BENCHMARK_DECLARE(FunctionsTests, testGammaValues, 100000, 0.5);
+QL_BENCHMARK_DECLARE(FunctionsTests, testModifiedBesselFunctions, 10000, 0.5);
+QL_BENCHMARK_DECLARE(FunctionsTests, testWeightedModifiedBesselFunctions, 20, 0.5);
+QL_BENCHMARK_DECLARE(LowDiscrepancyTests, testHalton, 80, 1.0);
+QL_BENCHMARK_DECLARE(GaussianQuadraturesTests, testNonCentralChiSquared, 4000, 0.5);
+QL_BENCHMARK_DECLARE(GaussianQuadraturesTests, testNonCentralChiSquaredSumOfNodes, 8000, 0.5);
+QL_BENCHMARK_DECLARE(GaussianQuadraturesTests, testMomentBasedGaussianPolynomial, 100000, 0.5);
+QL_BENCHMARK_DECLARE(RoundingTests, testCeiling, 100000, 0.1);
+QL_BENCHMARK_DECLARE(RoundingTests, testUp, 100000, 0.1);
+QL_BENCHMARK_DECLARE(RoundingTests, testFloor, 100000, 0.1);
+QL_BENCHMARK_DECLARE(RoundingTests, testDown, 100000, 0.1);
+QL_BENCHMARK_DECLARE(RoundingTests, testClosest, 100000, 0.1);
+
+
+
+
+int main(int argc, char* argv[] ) 
+{
     const std::string clientModeStr = "--client_mode=true";
     bool clientMode = false;
 
+    // Default number of worker processes to use
+#if defined(QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER)
+    unsigned nProc = std::thread::hardware_concurrency();
+#else
     unsigned nProc = 1;
-    unsigned nSize = 1;
-    std::vector<std::pair<Benchmark, double> > runTimes;
+#endif
+
+    // By default, run the smallest size we have.
+    std::string defaultSize = "3";
+    std::string size = defaultSize;
+
+    // A threadId is useful for debugging, but has no other purpose
+    unsigned threadId = 0;
 
+
+
+
+    ////  Argument handling  //////////////////////////
     for (int i=1; i<argc; ++i) {
         std::string arg = argv[i];
         std::vector<std::string> tok;
         boost::split(tok, arg, boost::is_any_of("="));
 
-        if (tok[0] == "--mp") {
-            nProc = (tok.size() == 2)
-                ? boost::numeric_cast<unsigned>(std::stoul(tok[1]))
-                : std::thread::hardware_concurrency();
+        if (tok[0] == "--nProc") {
+            QL_REQUIRE(tok.size() == 2, "Must provide a number of worker processes");
+            try {
+                nProc = boost::numeric_cast<unsigned>(std::stoul(tok[1]));
+            } catch(const std::exception &e) {
+                std::cerr << "Invalid argument to 'nProc', not a positive integer" << std::endl;
+                std::cerr << "Exception generated: " << e.what() << "\n";
+                exit(1);
+            }
+        }
+        else if (tok[0] == "--threadId") {
+            QL_REQUIRE(tok.size() == 2, "Must provide a threadId");
+            try {
+                threadId = boost::numeric_cast<unsigned>(std::stoul(tok[1]));                
+            } catch(const std::exception &e) {
+                std::cerr << "Invalid argument to 'threadId', not a positive integer. This is an internal error, please contact the developers" << std::endl;
+                std::cerr << "Exception generated: " << e.what() << "\n";
+                exit(1);
+            }
+        }
+        else if (tok[0] == "--verbose") {
+            QL_REQUIRE(tok.size() == 2, "Must provide a value for verbose");
+            try {
+                BenchmarkSupport::verbose = boost::numeric_cast<unsigned>(std::stoul(tok[1]));
+            } catch(const std::exception &e) {
+                std::cerr << "Invalid argument to 'verbose', not a positive integer" << std::endl;
+                std::cerr << "Exception generated: " << e.what() << "\n";
+                exit(1);
+            }
+            QL_REQUIRE(BenchmarkSupport::verbose>=0 && BenchmarkSupport::verbose <= 3, "Value for verbose must be 0, 1, 2 or 3");
         }
         else if (tok[0] == "--size") {
             QL_REQUIRE(tok.size() == 2,
-                "benchmark size is not given. Should be one out of S, M, L or XL, default is S");
-            if (tok[1] == "S")
-                nSize = 1;
-            else if (tok[1] == "M")
-                nSize = 3;
-            else if (tok[1] == "L")
-                nSize = 5;
-            else if (tok[1] == "XL")
-                nSize = 20;
-            else
-                QL_FAIL("unknown benchmark size, Should be one out of S, M, L or XL");
+                    "benchmark size is not given");
+            size = tok[1];
         }
-        else if (arg == "--help" || arg == "-?") {
+        else if (arg == "-h" || arg == "--help" || arg == "-?") {
             std::cout
-                << "'quantlib-benchmark' is QuantLib " QL_VERSION " CPU performance benchmark"
-                << std::endl << std::endl
-                << "Usage: ./quantlib-benchmark [OPTION]..."
-                << std::endl << std::endl
+                << "\n'quantlib-benchmark' is QuantLib " QL_VERSION " CPU performance benchmark\n"
+                << "\n"
+                << "You are strongly encouraged to run 'ulimit -n unlimited' before running this benchmark\n"
+                << "on Linux systems.  It uses Boost::IPC for parallelism, and a large number of file descriptors\n"
+                << "are needed to run this benchmark with a large number of worker processes.\n"
+                << "\n"
+                << "By default the benchmark uses a tiny size as a quick check that\n"
+                << "everything works.  To benchmark large systems a size of 'S' or larger\n"
+                << "should be used.\n"
+                << "\n"
+                << "Usage: ./quantlib-benchmark [OPTION] ...\n"
+                << "\n"
                 << "with the following options:"
-                << std::endl
+                << "\n"
 #ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER
-                << "--mp[=PROCESSES] \t parallel execution with PROCESSES processes"
-                << std::endl
+                << "--nProc=NN         \t parallel execution with NN worker processes.\n"
+                << "                   \t Default value is nProc=" << nProc << "\n"
+                << "\n"
 #endif
-                << "--size=S|M|L|XL \t size of the benchmark"
-                << std::endl
-                << "-?, --help \t\t display this help and exit"
+                << "--size=<";
+            for(const auto &p : BenchmarkSupport::bmSizes) {
+                std::cout << p.first << "|";
+            }
+            std::cout  << "NN> \n"
+                << "                   \t the size of the benchmark (how many times each \n"
+                << "                   \t task is run), where 'NN' can be any positive integer.\n"
+                << "                   \t Default value is size=" << defaultSize << "\n"
+                << "\n"
+                << "--verbose=<0|1|2|3>\t controls verbosity of output, default value is verbose=" << BenchmarkSupport::verbose << "\n"
+                << "\n"
+                << "-?, --help         \t display this help and exit"
                 << std::endl;
             return 0;
         }
@@ -328,87 +685,214 @@ int main(int argc, char* argv[] ) {
         }
     }
 
-    if (nProc == 1 && !clientMode) {
-        for (unsigned i=0; i < nSize; ++i)
-            std::for_each(bm.begin(), bm.end(),
-                [&runTimes](const Benchmark& iter) {
-                    runTimes.emplace_back(
-                        iter, TimedBenchmark(iter.getTestCase(), iter.getName())());
-            });
-        printResults(nProc, nSize, runTimes);
-    }
-    else {
-#ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER
-        using namespace boost::interprocess;
-
-        typedef std::pair<unsigned, double> result_type;
-
-        message_queue::size_type recvd_size;
-        unsigned priority, terminateId=-1;
-
-        const char* const testUnitIdQueueName = "test_unit_queue";
-        const char* const testResultQueueName = "test_result_queue";
-
-        if (!clientMode) {
-            message_queue::remove(testUnitIdQueueName);
-            message_queue::remove(testResultQueueName);
-            struct queue_remove {
-                explicit queue_remove(const char* name) : name_(name) { }
-                ~queue_remove() { message_queue::remove(name_); }
-
-            private:
-                const char* const name_;
-            } remover1(testUnitIdQueueName),remover2(testResultQueueName);
-
-            message_queue mq(
-                open_or_create, testUnitIdQueueName,
-                nProc*nSize*bm.size(), sizeof(unsigned)
-            );
-            message_queue rq(
-                open_or_create, testResultQueueName, std::max(16u, nProc), sizeof(result_type));
-
-            const std::vector<std::string> workerArgs(1, clientModeStr);
-            std::vector<std::thread> threadGroup;
-            for (unsigned i = 0; i < nProc; ++i) {
-                threadGroup.emplace_back([&]() { worker(argv[0], workerArgs); });
-            }
+    const unsigned int nSize = BenchmarkSupport::parseBmSize(size);
+    std::vector<double> workerLifetimes;
 
-            for (unsigned i=0; i < nProc*nSize; ++i)
-                for (unsigned j=0; j < bm.size(); ++j) {
-                    mq.send(&j, sizeof(unsigned), 0);
-                }
+    ////////  Finished argument processing, start benchmark code   //////////////////////////////////////////////
 
-            result_type r;
-            for (unsigned i = 0; i < nProc*nSize*bm.size(); ++i) {
-                rq.receive(&r, sizeof(result_type), recvd_size, priority);
-                runTimes.push_back(std::make_pair(bm[r.first], r.second));
-            }
-            for (unsigned i=0; i < nProc; ++i) {
-                mq.send(&terminateId, sizeof(unsigned), 0);
-            }
-            for (auto& thread: threadGroup) {
-                thread.join();
+    try {
+
+        // Ensure we find the Boost test_unit for each benchmark
+        TestUnitFinder::findAllTests(argv, bm);
+
+        // To alleviate tail effects, we sort the benchmarks so that the most expensive ones are first.
+        // These will be the first to be dispatched to the OS scheduler
+        std::sort(bm.begin(), bm.end(),
+                [](const auto& a, const auto& b) { return a.getCost() > b.getCost(); });
+
+
+        BenchmarkResult bmResult;
+        if( !clientMode) 
+            BenchmarkSupport::printGreeting(size, nProc);
+
+
+
+        if (nProc == 1 && !clientMode) {        
+            // Sequential benchmark, useful for debugging
+            auto startTime = std::chrono::steady_clock::now();
+            for (unsigned i=0; i < nSize; ++i) {
+                for(unsigned int j=0; j<bm.size(); j++) {
+                    double time;
+                    // First run the validation for each benchmark
+                    if(i == 0) {
+                        bmResult.reset();
+                        time = bm[j].runValidation();
+                        if( !bmResult.pass() ) {
+                            BenchmarkSupport::terminateBenchmark();
+                        }
+                    }
+                    else {
+                        time = bm[j].runBenchmark();
+                    }
+                    bm[j].getTotalRuntime() += time;
+                    LOG_MESSAGE("MASTER  :  completed benchmarkId=" << j << ", time=" << time);              
+                }
             }
-            printResults(nProc, nSize, runTimes);
+            auto stopTime = std::chrono::steady_clock::now();            
+            double masterLifetime = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
+            workerLifetimes.push_back(masterLifetime);        
+            BenchmarkSupport::printResults(nSize, masterLifetime, workerLifetimes);
         }
         else {
-            message_queue mq(open_only, testUnitIdQueueName);
-            message_queue rq(open_only, testResultQueueName);
 
-            unsigned id=0;
-            mq.receive(&id, sizeof(unsigned), recvd_size, priority);
+#if defined(QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER)
+
+            using namespace boost::interprocess;
+
+            message_queue::size_type recvd_size;
+            unsigned int priority=0;
+            const unsigned int terminateId=-1;
+            const char* const testUnitIdQueueName = "test_unit_queue";
+            const char* const testResultQueueName = "test_result_queue";
+
+            if (!clientMode) {     
+
+                // Boost IPC message queue setup
+                message_queue::remove(testUnitIdQueueName);
+                message_queue::remove(testResultQueueName);
+                struct queue_remove {
+                    explicit queue_remove(const char* name) : name_(name) { }
+                    ~queue_remove() { message_queue::remove(name_); }
+
+                    private:
+                    const char* const name_;
+                } remover1(testUnitIdQueueName),remover2(testResultQueueName);
+
+                message_queue mq(
+                        open_or_create, testUnitIdQueueName,
+                        nSize*bm.size(), sizeof(IPCInstructionMsg)
+                        );
+                message_queue rq(                
+                        open_or_create, testResultQueueName,                 
+                        std::max(16u, nProc),                 
+                        sizeof(IPCResultMsg)
+                        );
+
+
+                // Start timer for the benchmark
+                auto startTime = std::chrono::steady_clock::now();
+
+                // Create the thread group and start each worker process, giving it a unique threadId (useful for debugging)
+                std::vector<std::thread> threadGroup;            
+                {
+                    std::string thread("--threadId="), verb("--verbose=");
+                    verb += std::to_string(BenchmarkSupport::verbose);
+                    std::vector<std::string> workerArgs = {clientModeStr, thread, verb};            
+                    for (unsigned i = 0; i < nProc; ++i) {
+                        LOG_MESSAGE("MASTER    : creating worker threadId=" << i+1);         
+                        workerArgs[1] = thread + std::to_string(i+1);                                  
+                        threadGroup.emplace_back([&,workerArgs]() { BenchmarkSupport::worker(argv[0], workerArgs); });                
+                    }
+                }
+
+                IPCInstructionMsg msg;
+                IPCResultMsg r;
+                // Fire off all the benchmarks
+                for (unsigned j=0; j < bm.size(); ++j) {
+                    // Enqueue nSize copies of each task to even out load balance
+                    for (unsigned i=0; i < nSize; ++i) {
+                        // Do validation for the first run of each benchmark
+                        msg = {j, (i==0)};
+                        // Will be non-blocking send since send buffer is big enough 
+                        LOG_MESSAGE("MASTER    : sending benchmarkId=" << msg.j << " with validation=" << msg.validate);                   
+                        mq.send(&msg, sizeof(IPCInstructionMsg), 0);
+                    }
+                }
+                // Receive all results from workers
+                for (unsigned i=0; i < nSize*bm.size(); ++i) {                
+                    rq.receive(&r, sizeof(IPCResultMsg), recvd_size, priority);
+                    LOG_MESSAGE("MASTER     : received result : threadId=" << r.threadId << ", benchmarkId=" << r.bmId 
+                            << ", time=" << r.time << " : " << nSize*bm.size()-1-i << " results pending");    
+                    if(r.time < 0) {
+                        // A benchmark test has failed
+                        BenchmarkSupport::terminateBenchmark();
+                    }               
+                    bm[r.bmId].getTotalRuntime() += r.time;                             
+                }
+
+
+                // Send terminate signal to all workers
+                for (unsigned i=0; i < nProc; ++i) {
+                    LOG_MESSAGE("MASTER    : sending TERMINATE signal");                   
+                    msg = {terminateId, false};
+                    mq.send(&msg, sizeof(IPCInstructionMsg), 0);
+                }
+                // Receive worker lifetimes
+                for (unsigned i=0; i < nProc; ++i) {                
+                    rq.receive(&r, sizeof(IPCResultMsg), recvd_size, priority);
+                    LOG_MESSAGE("MASTER    : received worker lifetime : threadId=" << r.threadId << ", time=" << r.time << " : " << nProc-1-i << " lifetimes pending");
+                    workerLifetimes.push_back(r.time);
+                }
+
+
+                // Synchronize with and exit all threads
+                for (auto& thread: threadGroup) {
+                    thread.join();
+                }
+
+                auto stopTime = std::chrono::steady_clock::now();            
+                double masterLifetime = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
+                BenchmarkSupport::printResults(nSize, masterLifetime, workerLifetimes);
 
-            while (id != terminateId) {
-                result_type a(id, TimedBenchmark(bm[id].getTestCase(), bm[id].getName())());
-                rq.send(&a, sizeof(result_type), 0);
 
-                mq.receive(&id, sizeof(unsigned), recvd_size, priority);
             }
-        }
+            else {
+                // We are a worker process - open Boost IPC queues
+                message_queue mq(open_only, testUnitIdQueueName);
+                message_queue rq(open_only, testResultQueueName);
+
+                // Record start of this process's lifetime.  We keep track of lifetimes
+                // in order to monitor tail effects
+                auto startTime = std::chrono::steady_clock::now();
+                // If this worker has nothing to do, we still want a non-zero lifetime
+                auto stopTime = std::chrono::steady_clock::now();;
+
+                for(;;) {
+                    IPCInstructionMsg id;
+                    mq.receive(&id, sizeof(IPCInstructionMsg), recvd_size, priority);                
+
+                    if(id.j == terminateId) {
+                        // Worker process being told to terminate.  Report our lifetime.  
+                        // Lifetime is how long it took until we completed our final task                    
+                        double workerLifetime = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
+                        IPCResultMsg r {terminateId, threadId, workerLifetime};
+                        LOG_MESSAGE("WORKER-" << std::setw(3) << threadId << ": received TERMINATE signal, sending lifetime=" << r.time);
+                        rq.send(&r, sizeof(IPCResultMsg), 0);
+                        break;
+                    }
+                    else {
+                        LOG_MESSAGE("WORKER-" << std::setw(3) << threadId << ": received benchmarkId=" << id.j << ", validation=" << id.validate << ".  Starting execution ...");                    
+                        double time;
+                        if( id.validate ) {
+                            bmResult.reset();
+                            time = bm[id.j].runValidation();
+                            time = (bmResult.pass() ? time : -1.0);
+                        }
+                        else {
+                            time = bm[id.j].runBenchmark();
+                        }
+                        IPCResultMsg r {id.j, threadId, time};
+                        // We record the timestamp after each task is complete
+                        // We use this to define worker lifetime
+                        stopTime = std::chrono::steady_clock::now();
+                        LOG_MESSAGE("WORKER-" << std::setw(3) << threadId << ": sending result benchmarkId=" << id.j << ", time=" << r.time);
+                        rq.send(&r, sizeof(IPCResultMsg), 0);
+                    }                              
+                }
+                LOG_MESSAGE("WORKER-" << std::setw(3) << threadId << ": exiting");
+            }
+
 #else
-        std::cout << "Please compile QuantLib with option 'QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER'"
+            std::cout << "Please compile QuantLib with option 'QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER'"
                 " to run the benchmarks in parallel" << std::endl;
 #endif
+        }
+
+    } catch(const std::exception &e) {
+        if( !clientMode )
+            std::cerr << "MASTER process caught an exception:\n" << e.what() << std::endl;
+        else
+            std::cerr << "WORKER-" << std::setw(3) << threadId << " caught an exception:\n" << e.what() << std::endl;
     }
 
     return 0;

From b7f61945faa16a9ab1753d33663b2d77e654cfe1 Mon Sep 17 00:00:00 2001
From: Jacques du Toit <jadutoit@amd.com>
Date: Fri, 3 May 2024 13:05:27 +0000
Subject: [PATCH 08/12] Create a CMake Object library for the test-suite
 objects.

Use this to build the test-suite and benchmark executables.
This avoids having to do any additional compilation for the benchmark.
Instead, the same objects that were used to build the test suite are
used to build the benchmark.

This also more or less obviates the need for a CMake BUILD_BENCHMARK
variable.
---
 CMakeLists.txt            |  3 +-
 test-suite/CMakeLists.txt | 78 +++++++++------------------------------
 2 files changed, 18 insertions(+), 63 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f96676aa0d..f77d4c01d77 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,6 @@ set(QL_INSTALL_CMAKEDIR "lib/cmake/${PACKAGE_NAME}" CACHE STRING
     "Installation directory for CMake scripts")
 
 # Options
-option(QL_BUILD_BENCHMARK "Build benchmark" ON)
 option(QL_BUILD_EXAMPLES "Build examples" ON)
 option(QL_BUILD_TEST_SUITE "Build test suite" ON)
 option(QL_BUILD_FUZZ_TEST_SUITE "Build fuzz test suite" OFF) 
@@ -270,7 +269,7 @@ add_subdirectory(ql)
 if (QL_BUILD_EXAMPLES)
     add_subdirectory(Examples)
 endif()
-if (QL_BUILD_TEST_SUITE OR QL_BUILD_BENCHMARK)
+if (QL_BUILD_TEST_SUITE)
     add_subdirectory(test-suite)
 endif()
 
diff --git a/test-suite/CMakeLists.txt b/test-suite/CMakeLists.txt
index 6f453fe04e3..a4b558cf7e7 100644
--- a/test-suite/CMakeLists.txt
+++ b/test-suite/CMakeLists.txt
@@ -129,7 +129,6 @@ set(QL_TEST_SOURCES
     preconditions.cpp
     prices.cpp
     quantlibglobalfixture.cpp
-    quantlibtestsuite.cpp
     quantooption.cpp
     quotes.cpp
     rangeaccrual.cpp
@@ -183,78 +182,35 @@ set(QL_TEST_HEADERS
     utilities.hpp
 )
 
-set(QL_BENCHMARK_SOURCES
-    quantlibbenchmark.cpp
-    americanoption.cpp                  
-    andreasenhugevolatilityinterpl.cpp  
-    batesmodel.cpp                      
-    bermudanswaption.cpp                
-    cdo.cpp                             
-    cms.cpp          
-    cmsspread.cpp                       
-    convertiblebonds.cpp                
-    creditdefaultswap.cpp               
-    europeanoption.cpp                  
-    fdheston.cpp                        
-    fdmlinearop.cpp                     
-    functions.cpp     
-    gaussianquadratures.cpp   
-    hestonmodel.cpp                     
-    hestonslvmodel.cpp                  
-    interestrates.cpp        
-    libormarketmodel.cpp  
-    linearleastsquaresregression.cpp   
-    lowdiscrepancysequences.cpp         
-    marketmodel_cms.cpp                 
-    marketmodel_smm.cpp                 
-    markovfunctional.cpp                
-    mclongstaffschwartzengine.cpp       
-    overnightindexedswap.cpp            
-    piecewiseyieldcurve.cpp             
-    riskstats.cpp                       
-    rounding.cpp                        
-    shortratemodels.cpp                 
-    stats.cpp    
-    squarerootclvmodel.cpp              
-    swaptionvolatilitycube.cpp          
-    swingoption.cpp                     
-    preconditions.cpp                   preconditions.hpp
-    quantlibglobalfixture.cpp			quantlibglobalfixture.hpp
-    utilities.cpp                       utilities.hpp
-    variancegamma.cpp                   
-    vpp.cpp                            
-    zabr.cpp                           
-)
 
 if (QL_BUILD_TEST_SUITE)
-    add_executable(ql_test_suite ${QL_TEST_SOURCES} ${QL_TEST_HEADERS})
-    set_target_properties(ql_test_suite PROPERTIES OUTPUT_NAME "quantlib-test-suite")
-    set_source_files_properties(quantlibtestsuite.cpp PROPERTIES SKIP_UNITY_BUILD_INCLUSION true)
-    target_link_libraries(ql_test_suite PRIVATE
-        ql_library
-        ${QL_THREAD_LIBRARIES})
+    add_library(ql_test OBJECT ${QL_TEST_SOURCES} ${QL_TEST_HEADERS})
+    if (NOT Boost_USE_STATIC_LIBS)
+        target_compile_definitions(ql_test PUBLIC BOOST_ALL_DYN_LINK)
+    endif()
     if(MSVC AND CMAKE_UNITY_BUILD)
         # for Unity builds, we need to add /bigobj
-        target_compile_options(ql_test_suite PRIVATE "/bigobj")
+        target_compile_options(ql_test PUBLIC "/bigobj")
     endif()
+    target_link_libraries(ql_test PUBLIC ql_library ${QL_THREAD_LIBRARIES})
+
+
+    add_executable(ql_test_suite quantlibtestsuite.cpp)
+    set_source_files_properties(quantlibtestsuite.cpp PROPERTIES SKIP_UNITY_BUILD_INCLUSION true)
+    set_target_properties(ql_test_suite PROPERTIES OUTPUT_NAME "quantlib-test-suite")
+    target_link_libraries(ql_test_suite PRIVATE ql_test) 
+
     if (QL_INSTALL_TEST_SUITE)
         install(TARGETS ql_test_suite RUNTIME DESTINATION ${QL_INSTALL_BINDIR})
     endif()
     add_test(NAME quantlib_test_suite COMMAND ql_test_suite --log_level=message)
-endif()
 
-if (QL_BUILD_BENCHMARK)
-    add_executable(ql_benchmark ${QL_BENCHMARK_SOURCES})
+
+    add_executable(ql_benchmark quantlibbenchmark.cpp)
     set_target_properties(ql_benchmark PROPERTIES OUTPUT_NAME "quantlib-benchmark")
-    set_source_files_properties(quantlibbenchmark.cpp PROPERTIES SKIP_UNITY_BUILD_INCLUSION true)
-    target_link_libraries(ql_benchmark PRIVATE
-        ql_library
-        ${QL_THREAD_LIBRARIES})
-    if(MSVC AND CMAKE_UNITY_BUILD)
-        # for Unity builds, we need to add /bigobj
-        target_compile_options(ql_benchmark PRIVATE "/bigobj")
-    endif()
+    target_link_libraries(ql_benchmark PRIVATE ql_test)
     if (QL_INSTALL_BENCHMARK)
         install(TARGETS ql_benchmark RUNTIME DESTINATION ${QL_INSTALL_BINDIR})
     endif()
 endif()
+

From e18ed0167b9d3c847128c36fce1ac8a4cb5b87a9 Mon Sep 17 00:00:00 2001
From: Jacques du Toit <jadutoit@amd.com>
Date: Tue, 14 May 2024 13:11:14 +0000
Subject: [PATCH 09/12] Port CMake changes into automake

---
 test-suite/Makefile.am | 74 +++++++-----------------------------------
 1 file changed, 11 insertions(+), 63 deletions(-)

diff --git a/test-suite/Makefile.am b/test-suite/Makefile.am
index c154cc4b93b..74627bde20f 100644
--- a/test-suite/Makefile.am
+++ b/test-suite/Makefile.am
@@ -1,6 +1,5 @@
 
 QL_TEST_SRCS = \
-	quantlibtestsuite.cpp \
 	americanoption.cpp \
 	amortizingbond.cpp \
 	andreasenhugevolatilityinterpl.cpp \
@@ -174,6 +173,7 @@ QL_TEST_SRCS = \
 	zabr.cpp \
 	zerocouponswap.cpp
 
+
 QL_TEST_HDRS = \
     preconditions.hpp \
 	quantlibglobalfixture.hpp \
@@ -183,47 +183,6 @@ QL_TEST_HDRS = \
 
 QL_TESTS = ${QL_TEST_SRCS} ${QL_TEST_HDRS}
 
-QL_BENCHMARK_SRCS = \
-	quantlibbenchmark.cpp \
-	americanoption.cpp \
-	andreasenhugevolatilityinterpl.cpp \
-	batesmodel.cpp \
-	bermudanswaption.cpp \
-	cdo.cpp \
-	cmsspread.cpp \
-	convertiblebonds.cpp \
-	creditdefaultswap.cpp \
-	europeanoption.cpp \
-	fdheston.cpp \
-	fdmlinearop.cpp \
-	hestonmodel.cpp \
-	hestonslvmodel.cpp \
-	linearleastsquaresregression.cpp \
-	lowdiscrepancysequences.cpp \
-	marketmodel_cms.cpp \
-	marketmodel_smm.cpp \
-	preconditions.cpp \
-	quantlibglobalfixture.cpp \
-	quantooption.cpp \
-	markovfunctional.cpp \
-	mclongstaffschwartzengine.cpp \
-	overnightindexedswap.cpp \
-	piecewiseyieldcurve.cpp \
-	riskstats.cpp \
-	shortratemodels.cpp \
-	swaptionvolatilitycube.cpp \
-	swingoption.cpp \
-	utilities.cpp \
-	variancegamma.cpp \
-	vpp.cpp \
-	zabr.cpp
-
-QL_BENCHMARK_HDRS = \
-	quantlibglobalfixture.hpp \
-	preconditions.hpp \
-	utilities.hpp
-
-QL_BENCHMARKS = ${QL_BENCHMARK_SRCS} ${QL_BENCHMARK_HDRS}
 
 dist-hook:
 	mkdir -p $(distdir)/build
@@ -243,9 +202,7 @@ endif
 
 if UNITY_BUILD
 
-nodist_quantlib_test_suite_SOURCES = unity_test.cpp
-
-unity_test.cpp: Makefile.am
+unity.cpp: Makefile.am
 	echo "/* This file is automatically generated; do not edit.     */" > $@
 	echo "/* Add the files to be included into Makefile.am instead. */" >> $@
 	echo >> $@
@@ -253,34 +210,25 @@ unity_test.cpp: Makefile.am
 		echo "#include \"test-suite/$$i\"" >> $@; \
 	done
 
-nodist_quantlib_benchmark_SOURCES = unity_benchmark.cpp quantlibbenchmark.cpp
-UNITY_SRC = $(filter-out quantlibbenchmark.cpp,$(QL_BENCHMARK_SRCS))
-
-unity_benchmark.cpp: Makefile.am
-	echo "/* This file is automatically generated; do not edit.     */" > $@
-	echo "/* Add the files to be included into Makefile.am instead. */" >> $@
-	echo >> $@
-	for i in $(UNITY_SRC); do \
-		echo "#include \"test-suite/$$i\"" >> $@; \
-	done
-
-EXTRA_DIST = $(QL_TESTS) $(QL_BENCHMARKS)
+quantlib_test_suite_SOURCES = unity.cpp
+quantlib_benchmark_SOURCES = unity.cpp
+EXTRA_DIST = $(QL_TESTS) 
 
 else
 
-quantlib_test_suite_SOURCES = $(QL_TESTS)
-
-quantlib_benchmark_SOURCES = $(QL_BENCHMARKS)
-
+quantlib_test_suite_SOURCES = $(QL_TESTS) 
+quantlib_benchmark_SOURCES = $(QL_TESTS) 
 EXTRA_DIST =
 
-endif
+endif 
 
+quantlib_test_suite_SOURCES += quantlibtestsuite.cpp 
 quantlib_test_suite_LDADD = ${top_builddir}/ql/libQuantLib.la \
                             ${PTHREAD_LIB} ${BOOST_INTERPROCESS_LIB}
 
+quantlib_benchmark_SOURCES += quantlibbenchmark.cpp 
 quantlib_benchmark_LDADD = ${top_builddir}/ql/libQuantLib.la \
-                           ${PTHREAD_LIB}
+                           ${PTHREAD_LIB} ${BOOST_INTERPROCESS_LIB}
 
 TESTS = quantlib-test-suite$(EXEEXT)
 TESTS_ENVIRONMENT = BOOST_TEST_LOG_LEVEL=message BOOST_TEST_COLOR_OUTPUT=false

From fec4e076b2ac9cc29ea163f9e33ad3ef34d0c155 Mon Sep 17 00:00:00 2001
From: Jacques du Toit <jadutoit@amd.com>
Date: Wed, 7 Feb 2024 09:13:03 +0000
Subject: [PATCH 10/12] Relax tolerance on AmericanOption test

The test tolerance is very strict, and even small changes in floating-point
operation order cause the test to fail (the actual error is ~5e-15 rather than
the required 1e-15).
It is highly unlikely that failure in this case indicates invalid assembly.
This patch allows both AOCC with -O3 -zopt -amdveclib and ICPX with -O3 -xCORE-AVX512
-fp-model=precise to pass.
---
 test-suite/americanoption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test-suite/americanoption.cpp b/test-suite/americanoption.cpp
index 13bd8185832..570563e0191 100644
--- a/test-suite/americanoption.cpp
+++ b/test-suite/americanoption.cpp
@@ -1542,7 +1542,7 @@ BOOST_AUTO_TEST_CASE(testQdEngineStandardExample) {
         );
         const Real calculated = americanOption.NPV() - europeanOption.NPV();
 
-        const Real tol = 1e-15;
+        const Real tol = 7e-15;
         const Real diff = std::abs(calculated - expected[i]);
 
         if (diff > tol) {

From 24a23869e4c3e7cf737ec9c629a0bd5959f6d36d Mon Sep 17 00:00:00 2001
From: Luigi Ballabio <luigi.ballabio@gmail.com>
Date: Thu, 16 May 2024 17:45:11 +0200
Subject: [PATCH 11/12] Fix "make dist"

---
 test-suite/Makefile.am | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/test-suite/Makefile.am b/test-suite/Makefile.am
index 74627bde20f..00bad8d2e31 100644
--- a/test-suite/Makefile.am
+++ b/test-suite/Makefile.am
@@ -210,23 +210,25 @@ unity.cpp: Makefile.am
 		echo "#include \"test-suite/$$i\"" >> $@; \
 	done
 
-quantlib_test_suite_SOURCES = unity.cpp
-quantlib_benchmark_SOURCES = unity.cpp
-EXTRA_DIST = $(QL_TESTS) 
+nodist_quantlib_test_suite_SOURCES = unity.cpp
+quantlib_test_suite_SOURCES = quantlibtestsuite.cpp 
+
+nodist_quantlib_benchmark_SOURCES = unity.cpp
+quantlib_benchmark_SOURCES = quantlibbenchmark.cpp 
+
+EXTRA_DIST = $(QL_TESTS)
 
 else
 
-quantlib_test_suite_SOURCES = $(QL_TESTS) 
-quantlib_benchmark_SOURCES = $(QL_TESTS) 
+quantlib_test_suite_SOURCES = $(QL_TESTS) quantlibtestsuite.cpp 
+quantlib_benchmark_SOURCES = $(QL_TESTS) quantlibbenchmark.cpp 
 EXTRA_DIST =
 
 endif 
 
-quantlib_test_suite_SOURCES += quantlibtestsuite.cpp 
 quantlib_test_suite_LDADD = ${top_builddir}/ql/libQuantLib.la \
                             ${PTHREAD_LIB} ${BOOST_INTERPROCESS_LIB}
 
-quantlib_benchmark_SOURCES += quantlibbenchmark.cpp 
 quantlib_benchmark_LDADD = ${top_builddir}/ql/libQuantLib.la \
                            ${PTHREAD_LIB} ${BOOST_INTERPROCESS_LIB}
 

From 56a4fbc05e0188aa168a572dbc160c62b1000304 Mon Sep 17 00:00:00 2001
From: Luigi Ballabio <luigi.ballabio@gmail.com>
Date: Fri, 17 May 2024 15:08:25 +0200
Subject: [PATCH 12/12] Run benchmark in Linux CMake CI build

---
 .github/workflows/cmake.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index dafe942b20b..03eb2ec578c 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -29,6 +29,9 @@ jobs:
     - name: Test
       run: |
         quantlib-test-suite --log_level=message
+    - name: Run benchmark
+      run: |
+        quantlib-benchmark --size=1
   cmake-linux-with-options:
     runs-on: ubuntu-latest
     steps: