From 9f7b475efb3b6c050fa1483c9d46195708976ff1 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 2 Apr 2024 16:30:03 -0700
Subject: [PATCH] update toplev to TMA 4.8

* toplev updated to TMA 4.8:
  * Bottlenecks View:
    * Renamed Base_Non_Br to Useful_Work and simplified descriptions for all BV metrics.
    * Cache_Memory_Latency now accounts for L1 cache latency as well.
    * Improved Branching_Overhead accuracy for function calling and alignments
    * Cross-reference Bottlenecks w/ TMA tree for tool visualization (VTune request)
  * New Tree Nodes
    * L1_Hit_Latency: estimates fraction of cycles with demand load accesses that hit the L1 cache (relies on Dependent_Loads_Weight SystemParameter today)
  * New Informative Metrics
    * Fetch_LSD (client), Fetch_DSB, Fetch_MITE under Info.Pipeline group [SKL onwards]
    * DSB_Bandwidth under Info.Botlnk.L2
    * L2MPKI_RFO under Info.Memory
  * Key Enhancements & fixes
    * Fixed Ports_Utilization/Ports_Utilized_0
    * Slightly tuned memory (fixed cost) latencies [SPR, EMR]
    * Corrected CPU_Utilization, CPUs_Utilized for Linux perf based tools
---
 README.md                |  17 +
 adl_glc_ratios.py        | 568 ++++++++++++++++---------
 bdw_client_ratios.py     | 172 ++++----
 bdx_server_ratios.py     | 172 ++++----
 clx_server_ratios.py     | 591 ++++++++++++++++----------
 hsw_client_ratios.py     | 121 +++---
 hsx_server_ratios.py     | 121 +++---
 icl_client_ratios.py     | 608 +++++++++++++++++----------
 icx_server_ratios.py     | 669 +++++++++++++++++++-----------
 ivb_client_ratios.py     | 138 +++---
 ivb_server_ratios.py     | 138 +++---
 jkt_server_ratios.py     |  52 ++-
 skl_client_ratios.py     | 563 ++++++++++++++++---------
 skx_server_ratios.py     | 555 ++++++++++++++++---------
 snb_client_ratios.py     |  52 ++-
 spr_max_server_ratios.py | 829 +++++++++++++++++++------------------
 spr_server_ratios.py     | 875 +++++++++++++++++++++------------------
 tl-tester                |  10 +-
 toplev.py                |   3 +
 19 files changed, 3746 insertions(+), 2508 deletions(-)

diff --git a/README.md b/README.md
index 343cd24b..ba17a669 100644
--- a/README.md
+++ b/README.md
@@ -120,6 +120,23 @@ on newer Linux kernels.
 # Recent new features:
+## TMA 4.8 release
+* toplev updated to TMA 4.8:
+  * Bottlenecks View:
+    * Renamed Base_Non_Br to Useful_Work and simplified descriptions for all BV metrics.
+    * Cache_Memory_Latency now accounts for L1 cache latency as well.
+    * Improved Branching_Overhead accuracy for function calling and alignments
+    * Cross-reference Bottlenecks w/ TMA tree for tool visualization (VTune request)
+  * New Tree Nodes
+    * L1_Hit_Latency: estimates fraction of cycles with demand load accesses that hit the L1 cache (relies on Dependent_Loads_Weight SystemParameter today)
+  * New Informative Metrics
+    * Fetch_LSD (client), Fetch_DSB, Fetch_MITE under Info.Pipeline group [SKL onwards]
+    * DSB_Bandwidth under Info.Botlnk.L2
+    * L2MPKI_RFO under Info.Memory
+  * Key Enhancements & fixes
+    * Fixed Ports_Utilization/Ports_Utilized_0
+    * Slightly tuned memory (fixed cost) latencies [SPR, EMR]
+    * Corrected CPU_Utilization, CPUs_Utilized for Linux perf based tools
 * toplev now supports Meteor Lake systems.
 * Add a new genretlat.py tool to tune the toplev model for a workload. The
   basic tuning needs to be generated before first toplev use using genretlat
   -o mtl-retlat.json ./workloads/BC1s (or suitable workload).
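The new L1_Hit_Latency tree node scales L1-hit load counts by the Dependent_Loads_Weight SystemParameter and caps the result by cycles that had memory activity but no L1D miss. Below is a minimal standalone sketch of that estimate, mirroring the generated adl_glc_ratios.py formula in the diff that follows; the helper name and the example event counts are made up for illustration.

```python
# Sketch of the new L1_Hit_Latency estimate (TMA 4.8), mirroring the generated
# adl_glc_ratios.py formula. The event counts used below are hypothetical.

def l1_hit_latency(ev, dependent_loads_weight=20):
    """Estimated fraction of cycles exposed to L1d hit latency.

    dependent_loads_weight: SystemParameter from 0 (no dependent loads)
    to 100 (all loads are dependent); 20 is the model's default.
    """
    # Demand loads that hit L1 (all loads minus fill-buffer hits and L1 misses).
    l1_hit_loads = (ev["MEM_INST_RETIRED.ALL_LOADS"]
                    - ev["MEM_LOAD_RETIRED.FB_HIT"]
                    - ev["MEM_LOAD_RETIRED.L1_MISS"])
    # Cap: cycles with outstanding memory activity but no L1D miss pending.
    cap = max(ev["CYCLE_ACTIVITY.CYCLES_MEM_ANY"]
              - ev["MEMORY_ACTIVITY.CYCLES_L1D_MISS"], 0)
    # ~2 cycles per dependent L1 hit, weighted, capped, normalized to clocks.
    est = min(2 * l1_hit_loads * dependent_loads_weight / 100, cap)
    return est / ev["CPU_CLK_UNHALTED.THREAD"]

if __name__ == "__main__":
    # Hypothetical event counts, for illustration only.
    counts = {
        "MEM_INST_RETIRED.ALL_LOADS": 3_000_000,
        "MEM_LOAD_RETIRED.FB_HIT": 100_000,
        "MEM_LOAD_RETIRED.L1_MISS": 200_000,
        "CYCLE_ACTIVITY.CYCLES_MEM_ANY": 6_000_000,
        "MEMORY_ACTIVITY.CYCLES_L1D_MISS": 2_000_000,
        "CPU_CLK_UNHALTED.THREAD": 10_000_000,
    }
    print("L1_Hit_Latency ~= %.3f" % l1_hit_latency(counts))
```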
toplev diff --git a/adl_glc_ratios.py b/adl_glc_ratios.py index e220b76e..794adfba 100644 --- a/adl_glc_ratios.py +++ b/adl_glc_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 12th gen Core (code name Alderlake) with Golden Cove +# auto generated TopDown/TMA 4.8-full-perf description for Intel 12th gen Core (code name Alderlake) with Golden Cove # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -46,7 +49,9 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 -Errata_Whitelist = "ADL038" +Errata_Whitelist = "ADL038;ADL066" +PERF_METRICS_MSR = 1 +DS = 0 # Aux. formulas @@ -55,7 +60,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + EV("RS.EMPTY:u1", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -100,11 +105,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -173,7 +178,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def Num_CPUs(self, EV, level): - return 24 if smt_enabled else 16 + return num_cores * num_threads if num_cores else(8 + 16 /(2 - smt_enabled)) + +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -187,7 +196,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -195,23 +204,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related 
bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + 
self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -229,24 +238,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. 
def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -257,7 +272,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -273,7 +288,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -295,7 +310,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (EV("FP_ARITH_DISPATCHED.PORT_0", level) + EV("FP_ARITH_DISPATCHED.PORT_1", level) + EV("FP_ARITH_DISPATCHED.PORT_5", level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -303,12 +318,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -333,11 +342,11 @@ def IpBranch(self, EV, level): # Instructions per (near) call (lower number means higher occurrence rate) def IpCall(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("CPU_CLK_UNHALTED.NEAR_CALL", level) + val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_CALL", level) self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -353,31 +362,31 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. 
def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -413,10 +422,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -445,6 +465,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
+def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.LSD.compute(EV) + self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_DATA.STALLS", level) / EV("ICACHE_DATA.STALLS:c1:e1", level) @@ -479,25 +505,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -507,7 +533,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -559,6 +585,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -575,19 +605,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -651,17 +677,17 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -733,11 +759,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_BUBBLES.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -772,7 +798,7 @@ class Fetch_Latency: maxval = None def compute(self, EV): try: - self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) + self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) if topdown_use_fixed else(EV("IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", 2) * Pipeline_Width - EV("INT_MISC.UOP_DROPPING", 2)) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Fetch_Latency zero division") @@ -795,7 +821,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -820,7 +846,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -877,7 +903,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -925,7 +951,7 @@ class Unknown_Branches: sample = ['FRONTEND_RETIRED.UNKNOWN_BRANCH'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1206,11 +1232,11 @@ class Branch_Mispredicts: sample = ['TOPDOWN.BR_MISPREDICT_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.BR_MISPREDICT_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) 
and self.parent.thresh except ZeroDivisionError: handle_error(self, "Branch_Mispredicts zero division") @@ -1237,7 +1263,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1261,7 +1287,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1291,7 +1317,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1315,11 +1341,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1351,7 +1377,7 @@ class Memory_Bound: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.MEMORY_BOUND_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Memory_Bound zero division") @@ -1407,8 +1433,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1439,7 +1465,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1463,7 +1489,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1508,13 +1534,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , 
max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1567,7 +1618,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1596,7 +1647,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1646,7 +1697,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1674,7 +1725,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1701,7 +1752,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1729,7 +1780,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1777,7 +1828,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1815,7 +1866,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1871,7 +1922,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1899,7 +1950,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1954,7 +2005,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, 
EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -1978,7 +2030,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2086,8 +2138,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIV_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2112,7 +2164,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2271,7 +2323,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + EV("RS.EMPTY:u1", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) + self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + max(EV("RS.EMPTY:u1", 4) - EV("RESOURCE_STALLS.SCOREBOARD", 4) , 0)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2381,7 +2433,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2544,11 +2596,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2827,9 +2879,9 @@ def compute(self, EV): handle_error(self, "Int_Vector_256b zero division") return self.val desc = """ -This metric represents 256-bit vector Integer ADD/SUB/SAD or -VNNI (Vector Neural Network Instructions) uops fraction the -CPU has retired.""" +This metric represents 256-bit vector Integer +ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) +uops fraction the CPU has retired.""" class Memory_Operations: @@ -2865,7 +2917,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2877,8 +2929,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. 
CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -2891,7 +2943,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2941,7 +2993,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2996,7 +3048,7 @@ class Heavy_Operations: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("UOPS_RETIRED.HEAVY", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) except ZeroDivisionError: handle_error(self, "Heavy_Operations zero division") @@ -3071,7 +3123,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3212,7 +3264,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3232,7 +3284,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3253,7 +3305,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3264,7 +3316,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3273,7 +3326,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3293,7 +3346,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3313,7 +3366,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3333,7 +3386,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3343,7 +3396,7 @@ def 
compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3354,7 +3407,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3375,7 +3428,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3397,7 +3450,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3407,10 +3460,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3419,7 +3471,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3429,31 +3481,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). 
Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3508,7 +3583,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3690,7 +3765,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3712,26 +3788,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3848,7 +3904,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3910,8 +3966,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3931,8 +3988,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). 
May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3952,8 +4010,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3973,8 +4032,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -3994,8 +4053,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4121,7 +4180,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4135,8 +4194,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4262,6 +4377,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + 
self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4417,7 +4552,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4437,7 +4572,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4457,7 +4592,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4477,7 +4612,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4518,7 +4653,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4759,6 +4894,26 @@ def compute(self, EV): loads (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4856,8 +5011,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4876,8 +5030,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4896,8 +5049,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4916,8 +5068,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5200,7 +5351,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = 
"Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5219,7 +5370,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5273,7 +5424,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5553,6 +5704,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = FB_Full() ; r.run(n) ; o["FB_Full"] = n @@ -5645,6 +5797,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["FB_Full"].parent = o["L1_Bound"] @@ -5722,7 +5875,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5736,7 +5890,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5757,12 +5910,16 @@ def __init__(self, r): n = Metric_Strings_Cycles() ; r.metric(n) ; o["Strings_Cycles"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_LSD_Coverage() ; r.metric(n) ; o["LSD_Coverage"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_Unknown_Branch_Cost() ; r.metric(n) ; o["Unknown_Branch_Cost"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5788,6 +5945,7 @@ def __init__(self, r): 
n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5961,44 +6119,51 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] + o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] - o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] - o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] + o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] - o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] 
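# --- Editor's note (illustrative sketch, not part of the generated model) ----
# The attribute assignments in this hunk implement the "Cross-reference
# Bottlenecks w/ TMA tree" item from the changelog: each Bottlenecks-View
# metric object is handed direct references to the tree-node objects it is
# derived from, e.g. Cache_Memory_Latency now also points at the new
# L1_Hit_Latency node. The helper below shows how a visualization tool could
# walk those references generically; related_tree_nodes() is a hypothetical
# name used only for this sketch, not toplev API.
def related_tree_nodes(metric, candidate_names):
    """Collect the TMA tree-node objects attached to a Bottlenecks-View metric."""
    return {name: getattr(metric, name)
            for name in candidate_names
            if hasattr(metric, name)}

# Hypothetical usage inside the __init__(self, r) wiring above, once the
# assignments have run:
#   related_tree_nodes(o["Cache_Memory_Latency"],
#                      ["L1_Hit_Latency", "L3_Hit_Latency", "MEM_Latency"])
#   -> {"L1_Hit_Latency": <node>, "L3_Hit_Latency": <node>, "MEM_Latency": <node>}
# ------------------------------------------------------------------------------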
o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -6006,6 +6171,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Synchronization"].L1_Bound = o["L1_Bound"] o["Memory_Synchronization"].Retiring = o["Retiring"] @@ -6081,7 +6247,7 @@ def __init__(self, r): o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] - o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] @@ -6101,6 +6267,7 @@ def __init__(self, r): o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] + o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] @@ -6123,19 +6290,19 @@ def __init__(self, r): o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].LSD = o["LSD"] o["DSB_Misses"].MITE = o["MITE"] @@ -6150,6 +6317,12 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] 
o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].LSD = o["LSD"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6226,5 +6399,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/bdw_client_ratios.py b/bdw_client_ratios.py index 557c36b0..6a1e9d1a 100644 --- a/bdw_client_ratios.py +++ b/bdw_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 5th gen Core / Core M (code named Broadwell) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 5th gen Core / Core M (code named Broadwell) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -49,6 +52,8 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "BDE69;BDE70" +EBS_Mode = 0 +DS = 0 # Aux. 
formulas @@ -88,11 +93,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def Frontend_RS_Empty_Cycles(self, EV, level): EV("RS_EVENTS.EMPTY_CYCLES", level) @@ -183,7 +188,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -217,13 +222,13 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -249,7 +254,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -265,31 +270,31 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. 
def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -303,7 +308,6 @@ def Instructions(self, EV, level): def Retire(self, EV, level): return Retired_Slots(self, EV, level) / EV("UOPS_RETIRED.RETIRE_SLOTS:c1", level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) @@ -323,7 +327,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -353,6 +357,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -365,15 +373,12 @@ def L2HPKI_Load(self, EV, level): def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -409,17 +414,17 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -477,7 +482,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -539,7 +544,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -564,7 +569,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -621,7 +626,7 @@ class Mispredicts_Resteers: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -669,7 +674,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -900,7 +905,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -931,7 +936,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -961,7 +966,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1053,8 +1058,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION:c1", 4) + 7 * EV("DTLB_LOAD_MISSES.WALK_COMPLETED", 4)) / CLKS(self, EV, 4) @@ -1196,7 +1201,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1225,7 +1230,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1275,7 +1280,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) 
maxval = 1.0 def compute(self, EV): try: @@ -1303,7 +1308,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1330,7 +1335,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1358,7 +1363,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1406,7 +1411,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1444,7 +1449,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1500,7 +1505,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1528,7 +1533,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1579,7 +1584,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1639,8 +1644,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -1787,7 +1792,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2068,7 +2073,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2345,7 +2350,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2451,7 +2456,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2612,7 +2617,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class 
Metric_CORE_CLKS: @@ -2731,7 +2737,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2793,8 +2799,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -2814,8 +2821,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -2835,8 +2843,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -2856,8 +2865,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -2877,8 +2886,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_Instructions: @@ -2922,7 +2931,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -2936,8 +2945,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" class Metric_DSB_Coverage: @@ -3020,7 +3028,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -3145,6 +3153,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -3221,8 +3249,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -3241,8 +3268,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -3261,8 +3287,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -3405,7 +3430,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -3424,7 +3449,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3478,7 +3503,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -3845,6 +3870,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n diff --git a/bdx_server_ratios.py b/bdx_server_ratios.py index fa52a13a..7a55edc1 100644 --- a/bdx_server_ratios.py +++ b/bdx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon E5 v4 (code named Broadwell EP) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon E5 v4 (code named Broadwell EP) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -49,6 +52,8 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "BDE69;BDE70" +EBS_Mode = 0 +DS = 1 # Aux. formulas @@ -88,11 +93,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def Frontend_RS_Empty_Cycles(self, EV, level): EV("RS_EVENTS.EMPTY_CYCLES", level) @@ -207,7 +212,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -241,13 +246,13 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. 
#Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -273,7 +278,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -289,31 +294,31 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. 
def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -327,7 +332,6 @@ def Instructions(self, EV, level): def Retire(self, EV, level): return Retired_Slots(self, EV, level) / EV("UOPS_RETIRED.RETIRE_SLOTS:c1", level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) @@ -347,7 +351,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -377,6 +381,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -389,15 +397,12 @@ def L2HPKI_Load(self, EV, level): def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -433,11 +438,11 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -447,7 +452,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -509,7 +514,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -571,7 +576,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -596,7 +601,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -653,7 +658,7 @@ class Mispredicts_Resteers: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -701,7 +706,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -932,7 +937,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -963,7 +968,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -993,7 +998,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1085,8 +1090,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION:c1", 4) + 7 * EV("DTLB_LOAD_MISSES.WALK_COMPLETED", 4)) / CLKS(self, EV, 4) @@ -1228,7 +1233,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1257,7 +1262,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1307,7 +1312,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) 
maxval = 1.0 def compute(self, EV): try: @@ -1335,7 +1340,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1362,7 +1367,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1390,7 +1395,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1438,7 +1443,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1476,7 +1481,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1605,7 +1610,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1633,7 +1638,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1684,7 +1689,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1744,8 +1749,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -1892,7 +1897,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2173,7 +2178,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2450,7 +2455,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2556,7 +2561,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2717,7 +2722,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per 
logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2836,7 +2842,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2898,8 +2904,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -2919,8 +2926,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -2940,8 +2948,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -2961,8 +2970,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -2982,8 +2991,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_Instructions: @@ -3027,7 +3036,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -3041,8 +3050,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" class Metric_DSB_Coverage: @@ -3125,7 +3133,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -3250,6 +3258,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -3326,8 +3354,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -3346,8 +3373,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -3366,8 +3392,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -3510,7 +3535,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -3529,7 +3554,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3602,7 +3627,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -3997,6 +4022,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n diff --git a/clx_server_ratios.py b/clx_server_ratios.py index 89d9553c..04517706 100644 --- a/clx_server_ratios.py +++ b/clx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors 2nd gen (code named Cascade Lake) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors 2nd gen (code named Cascade Lake) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 1 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,10 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "SKL091" +EBS_Mode = 0 +Memory = 1 +PMM_App_Direct = 1 if Memory == 1 else 0 +DS = 1 # Aux. formulas @@ -59,7 +66,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - (EV("BR_INST_RETIRED.COND", level) - EV("BR_INST_RETIRED.NOT_TAKEN", level)) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -113,7 +120,7 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): @@ -139,19 +146,24 @@ def LOAD_LCL_MEM(self, EV, level): return EV("MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) def LOAD_LCL_PMM(self, EV, level): - return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) + return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_FWD(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * 
(1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_HITM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_MEM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_PMM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_XSNP_HIT(self, EV, level): return EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT", level) @@ -166,7 +178,7 @@ def MEM_Bound_Ratio(self, EV, level): return EV("CYCLE_ACTIVITY.STALLS_L3_MISS", level) / CLKS(self, EV, level) + L2_Bound_Ratio(self, EV, level) - self.L2_Bound.compute(EV) def Mem_DDR_Hit_Fraction(self, EV, level): - return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) + return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) if DS else 1 def Mem_Lock_St_Fraction(self, EV, level): return EV("MEM_INST_RETIRED.LOCK_LOADS", level) / EV("MEM_INST_RETIRED.ALL_STORES", level) @@ -233,11 +245,9 @@ def Retired_Slots(self, EV, level): def Num_CPUs(self, EV, level): return 112 if smt_enabled else 56 -def Memory(self, EV, level): - return 1 - -def PMM_App_Direct(self, EV, level): - return 1 if Memory(self, EV, level)== 1 else 0 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -251,7 +261,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -259,23 +269,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + 
self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + 
self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -293,24 +303,30 @@ def Irregular_Overhead(self, 
EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -321,7 +337,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -355,7 +371,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -363,15 +379,9 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -397,7 +407,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -413,37 +423,37 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
+# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX512(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -473,10 +483,17 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -497,6 +514,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
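# For illustration only (hypothetical node values, not part of the generated model):
# with Frontend_Bound = 0.30, Fetch_Bandwidth = 0.20, Fetch_Latency = 0.10, and a DSB:MITE
# delivery split of 0.6:0.2, the formula below yields
# 100 * 0.30 * (0.20 / (0.20 + 0.10)) * (0.6 / (0.2 + 0.6)) = 15,
# i.e. roughly 15% of pipeline slots attributed to DSB-hit fetch bandwidth, above the >10 threshold.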
+def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) + 2 @@ -531,7 +554,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -541,7 +564,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -589,6 +612,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -605,19 +632,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -681,11 +704,11 @@ def UC_Load_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # 
Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -695,7 +718,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -747,7 +770,8 @@ def MEM_Parallel_Reads(self, EV, level): # Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_PMM_Read_Latency(self, EV, level): - return (OneBillion *(EV("UNC_M_PMM_RPQ_OCCUPANCY.ALL", level) / EV("UNC_M_PMM_RPQ_INSERTS", level)) / EV("UNC_M_CLOCKTICKS:one_unit", level)) + EV("UNC_M_PMM_RPQ_OCCUPANCY.ALL", level) + return (OneBillion *(EV("UNC_M_PMM_RPQ_OCCUPANCY.ALL", level) / EV("UNC_M_PMM_RPQ_INSERTS", level)) / EV("UNC_M_CLOCKTICKS:one_unit", level)) if PMM_App_Direct else 0 # Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_DRAM_Read_Latency(self, EV, level): @@ -755,11 +779,11 @@ def MEM_DRAM_Read_Latency(self, EV, level): # Average 3DXP Memory Bandwidth Use for reads [GB / sec] def PMM_Read_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average 3DXP Memory Bandwidth Use for Writes [GB / sec] def PMM_Write_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. 
Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU def IO_Read_BW(self, EV, level): @@ -797,7 +821,7 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -859,7 +883,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -884,7 +908,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -941,7 +965,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -989,7 +1013,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1243,7 +1267,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1274,7 +1298,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1298,7 +1322,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1328,7 +1352,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1352,7 +1376,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1444,8 +1468,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1476,7 +1500,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - 
self.Load_STLB_Miss.compute(EV) @@ -1500,7 +1524,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1545,13 +1569,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1635,7 +1684,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1664,7 +1713,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1714,7 +1763,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1742,7 +1791,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1769,7 +1818,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1797,7 +1846,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1825,7 +1874,7 @@ class DRAM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = (MEM_Bound_Ratio(self, EV, 3) - self.PMM_Bound.compute(EV)) + self.val = (MEM_Bound_Ratio(self, EV, 3) - self.PMM_Bound.compute(EV)) if PMM_App_Direct else MEM_Bound_Ratio(self, EV, 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "DRAM_Bound zero 
division") @@ -1845,7 +1894,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1883,7 +1932,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1909,7 +1958,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -1940,7 +1989,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_MEM zero division") @@ -1964,7 +2014,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -1989,10 +2041,10 @@ class PMM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0 ) - EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0) if PMM_App_Direct else 0 EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3) EV("MEM_LOAD_RETIRED.L1_MISS", 3) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "PMM_Bound zero division") @@ -2041,7 +2093,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2069,7 +2121,7 @@ class False_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] errcount = 0 sibling = None - metricgroup = 
frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2120,7 +2172,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2228,8 +2280,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2254,7 +2306,7 @@ class Serializing_Operation: sample = ['PARTIAL_RAT_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2342,7 +2394,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2453,7 +2505,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2734,7 +2786,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -3005,7 +3057,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3017,8 +3069,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -3031,7 +3083,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3081,7 +3133,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3186,7 +3238,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3277,7 +3329,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3297,7 +3349,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3318,7 +3370,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3329,7 +3381,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3338,7 +3391,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3358,7 +3411,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3378,7 +3431,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3398,7 +3451,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3408,7 +3461,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3419,7 +3472,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3440,7 +3493,7 @@ class 
Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3462,7 +3515,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3472,10 +3525,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3484,7 +3536,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3494,31 +3546,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. 
Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3573,7 +3648,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3734,7 +3809,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3756,26 +3832,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3892,7 +3948,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3954,8 +4010,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3975,8 +4032,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3996,8 +4054,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -4017,8 +4076,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -4038,8 +4097,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -4059,8 +4118,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4166,7 +4225,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4180,8 +4239,45 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4267,6 +4363,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4422,7 +4538,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4463,7 +4579,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4684,6 +4800,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo 
instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4781,8 +4917,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4801,8 +4936,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4821,8 +4955,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4841,8 +4974,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5128,7 +5260,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5147,7 +5279,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5220,7 +5352,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5644,6 +5776,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5736,6 +5869,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5814,7 +5948,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5827,7 +5962,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; 
o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5848,10 +5982,13 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5873,6 +6010,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5950,16 +6088,7 @@ def __init__(self, r): o["Core_Bound"].Retiring = o["Retiring"] o["Core_Bound"].Frontend_Bound = o["Frontend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilization"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -6040,6 +6169,7 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].Backend_Bound = o["Backend_Bound"] o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] @@ -6056,28 +6186,35 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] 
o["Cache_Memory_Latency"].PMM_Bound = o["PMM_Bound"] o["Cache_Memory_Latency"].Frontend_Bound = o["Frontend_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] - o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] @@ -6085,6 +6222,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].PMM_Bound = o["PMM_Bound"] o["Memory_Data_TLBs"].Frontend_Bound = o["Frontend_Bound"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -6157,20 +6295,12 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] - o["Other_Bottlenecks"].PMM_Bound = o["PMM_Bound"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -6178,43 +6308,52 @@ def 
__init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].PMM_Bound = o["PMM_Bound"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] o["Other_Bottlenecks"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] - o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] + o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = 
o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] @@ -6234,6 +6373,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6309,5 +6453,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/hsw_client_ratios.py b/hsw_client_ratios.py index 0d7dd304..b24bf58f 100644 --- a/hsw_client_ratios.py +++ b/hsw_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 4rd gen Core (code named Haswell) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 4rd gen Core (code named Haswell) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +EBS_Mode = 0 +DS = 0 # Aux. 
formulas @@ -179,7 +184,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -201,13 +206,13 @@ def SLOTS(self, EV, level): def CoreIPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return (EV("UOPS_EXECUTED.CORE", level) / 2 / Execute_Cycles(self, EV, level)) if smt_enabled else EV("UOPS_EXECUTED.CORE", level) / Execute_Cycles(self, EV, level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -233,7 +238,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -267,7 +272,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
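# Side note on the CPU_Utilization / CPUs_Utilized swap applied in the hunk above:
# CPUs_Utilized is now the raw ratio CPU_CLK_UNHALTED.REF_TSC / msr/tsc/ (an average
# number of busy CPUs, maxval 300), and CPU_Utilization divides that by Num_CPUs to
# give a 0..1 fraction of the machine (maxval 1). A minimal standalone sketch, assuming
# made-up counter readings and a hypothetical 8-CPU system (these values are not taken
# from the model files):
def example_cpu_utilization(ref_tsc_sum=8.0e9, tsc=2.0e9, num_cpus=8):
    cpus_utilized = ref_tsc_sum / tsc           # average number of busy CPUs, e.g. 4.0
    cpu_utilization = cpus_utilized / num_cpus  # fraction of the machine, e.g. 0.5
    return cpus_utilized, cpu_utilization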
def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -289,19 +294,20 @@ def L1MPKI(self, EV, level): def L2MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L2_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L3 cache true misses per kilo instruction for retired demand loads def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -337,11 +343,11 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -401,7 +407,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -463,7 +469,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -488,7 +494,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -750,7 +756,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -781,7 +787,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -811,7 +817,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -903,8 +909,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 
sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -1046,7 +1052,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1075,7 +1081,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1125,7 +1131,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1153,7 +1159,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1180,7 +1186,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1208,7 +1214,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1256,7 +1262,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1294,7 +1300,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1350,7 +1356,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1378,7 +1384,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1429,7 +1435,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1489,8 +1495,8 @@ class Divider: sample = ['ARITH.DIVIDER_UOPS'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = 10 * EV("ARITH.DIVIDER_UOPS", 3) / CORE_CLKS(self, EV, 3) @@ -1637,7 +1643,7 @@ class Ports_Utilized_3m: sample = [] 
errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -1918,7 +1924,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2042,7 +2048,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2148,7 +2154,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2247,7 +2253,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2366,7 +2373,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2508,7 +2515,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -2593,6 +2600,26 @@ def compute(self, EV): loads""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L3MPKI: name = "L3MPKI" domain = "Metric" @@ -2629,8 +2656,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -2649,8 +2675,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -2669,8 +2694,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -2813,7 +2837,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -2832,7 +2856,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3202,6 +3226,7 @@ def __init__(self, r): n = Metric_MLP() ; r.metric(n) ; o["MLP"] = n n = Metric_L1MPKI() ; r.metric(n) ; 
o["L1MPKI"] = n n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_L1D_Cache_Fill_BW() ; r.metric(n) ; o["L1D_Cache_Fill_BW"] = n n = Metric_L2_Cache_Fill_BW() ; r.metric(n) ; o["L2_Cache_Fill_BW"] = n diff --git a/hsx_server_ratios.py b/hsx_server_ratios.py index 755f2349..29ad7ea2 100644 --- a/hsx_server_ratios.py +++ b/hsx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon E5 v3 (code Named Haswell EP) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon E5 v3 (code Named Haswell EP) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +EBS_Mode = 0 +DS = 1 # Aux. formulas @@ -203,7 +208,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -225,13 +230,13 @@ def SLOTS(self, EV, level): def CoreIPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return (EV("UOPS_EXECUTED.CORE", level) / 2 / Execute_Cycles(self, EV, level)) if smt_enabled else EV("UOPS_EXECUTED.CORE", level) / Execute_Cycles(self, EV, level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -257,7 +262,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -291,7 +296,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -313,19 +318,20 @@ def L1MPKI(self, EV, level): def L2MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L2_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L3 cache true misses per kilo instruction for retired demand loads def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -361,11 +367,11 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -433,7 +439,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -495,7 +501,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -520,7 +526,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -782,7 +788,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -813,7 +819,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -843,7 +849,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -935,8 +941,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 
sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -1078,7 +1084,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1107,7 +1113,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1157,7 +1163,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1185,7 +1191,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1212,7 +1218,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1240,7 +1246,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1288,7 +1294,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1326,7 +1332,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1455,7 +1461,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1483,7 +1489,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1534,7 +1540,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1594,8 +1600,8 @@ class Divider: sample = ['ARITH.DIVIDER_UOPS'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = 10 * 
EV("ARITH.DIVIDER_UOPS", 3) / CORE_CLKS(self, EV, 3) @@ -1742,7 +1748,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2023,7 +2029,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2147,7 +2153,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2253,7 +2259,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2352,7 +2358,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2471,7 +2478,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2613,7 +2620,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -2698,6 +2705,26 @@ def compute(self, EV): loads""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L3MPKI: name = "L3MPKI" domain = "Metric" @@ -2734,8 +2761,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -2754,8 +2780,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -2774,8 +2799,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -2918,7 +2942,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -2937,7 +2961,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3354,6 
+3378,7 @@ def __init__(self, r): n = Metric_MLP() ; r.metric(n) ; o["MLP"] = n n = Metric_L1MPKI() ; r.metric(n) ; o["L1MPKI"] = n n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_L1D_Cache_Fill_BW() ; r.metric(n) ; o["L1D_Cache_Fill_BW"] = n n = Metric_L2_Cache_Fill_BW() ; r.metric(n) ; o["L2_Cache_Fill_BW"] = n diff --git a/icl_client_ratios.py b/icl_client_ratios.py index 114f6aa2..c3a7d3d9 100644 --- a/icl_client_ratios.py +++ b/icl_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 10th gen Core (code name Icelake) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 10th gen Core (code name Icelake) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -47,6 +50,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +PERF_METRICS_MSR = 1 +DS = 0 # Aux. formulas @@ -58,7 +63,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -103,11 +108,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0xfc", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -184,6 +189,10 @@ def Retired_Slots(self, EV, level): def Num_CPUs(self, EV, level): return 8 if smt_enabled else 4 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 + # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): val = 100 *(1 - Umisp(self, EV, level)) * (self.Branch_Mispredicts.compute(EV) + self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV))) @@ -196,7 +205,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -204,23 +213,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + 
self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + 
self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -238,24 +247,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks 
(apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -266,7 +281,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -282,7 +297,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -304,7 +319,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -312,12 +327,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -346,7 +355,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -362,37 +371,37 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
+# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. 
def IpArith_AVX512(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -422,10 +431,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -450,6 +470,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.LSD.compute(EV) + self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) @@ -484,25 +510,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -512,7 +538,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -564,6 +590,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all demand loads (including speculative) def L2HPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_HIT", level) / EV("INST_RETIRED.ANY", level) @@ -576,19 +606,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -652,17 +678,17 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -736,11 +762,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -798,7 +824,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -823,7 +849,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -880,7 +906,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -928,7 +954,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1232,7 +1258,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1263,7 +1289,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1287,7 +1313,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1317,7 +1343,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1341,11 +1367,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / 
EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + (Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + (Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) + Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1433,8 +1459,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1465,7 +1491,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1489,7 +1515,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1534,13 +1560,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. 
The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1624,7 +1675,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1653,7 +1704,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1703,7 +1754,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1731,7 +1782,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1758,7 +1809,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1786,7 +1837,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1834,7 +1885,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1872,7 +1923,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1928,7 +1979,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1956,7 +2007,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2011,7 +2062,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -2035,7 +2087,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = 
frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2143,8 +2195,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2169,7 +2221,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2257,7 +2309,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2368,7 +2420,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2557,11 +2609,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2829,7 +2881,7 @@ class Branch_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2876,7 +2928,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2982,7 +3034,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3073,7 +3125,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3093,7 +3145,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3114,7 +3166,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, 
EV): @@ -3125,7 +3177,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3134,7 +3187,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3154,7 +3207,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3174,7 +3227,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3194,7 +3247,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3204,7 +3257,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3215,7 +3268,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3236,7 +3289,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3258,7 +3311,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3268,10 +3321,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. 
Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3280,7 +3332,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3290,31 +3342,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3369,7 +3444,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3550,7 +3625,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3572,26 +3648,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. 
Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3708,7 +3764,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3770,8 +3826,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3791,8 +3848,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3812,8 +3870,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3833,8 +3892,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -3854,8 +3913,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -3875,8 +3934,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -3982,7 +4041,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -3996,8 +4055,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4103,6 +4218,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4258,7 +4393,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4278,7 +4413,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4298,7 +4433,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4318,7 +4453,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, 
"IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4359,7 +4494,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4600,6 +4735,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_Load: name = "L2HPKI_Load" domain = "Metric" @@ -4677,8 +4832,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4697,8 +4851,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4717,8 +4870,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4737,8 +4889,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5021,7 +5172,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5040,7 +5191,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5094,7 +5245,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5376,6 +5527,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5462,6 +5614,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5532,7 +5685,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5546,7 +5700,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5567,11 +5720,15 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_LSD_Coverage() ; r.metric(n) ; o["LSD_Coverage"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5597,6 +5754,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_FB_HPKI() ; r.metric(n) ; o["FB_HPKI"] = n @@ -5675,14 +5833,7 @@ def __init__(self, r): o["Core_Bound"].Retiring = o["Retiring"] 
o["Core_Bound"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -5774,8 +5925,9 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Bandwidth"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].Retiring = o["Retiring"] + o["Cache_Memory_Bandwidth"].G4K_Aliasing = o["G4K_Aliasing"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] @@ -5791,38 +5943,46 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] - o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] - o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].Split_Stores = 
o["Split_Stores"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] o["Memory_Data_TLBs"].Retiring = o["Retiring"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Store_Bound = o["Store_Bound"] o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -5889,19 +6049,12 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -5909,52 +6062,60 @@ def __init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + 
o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] o["Other_Bottlenecks"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] - o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] + o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Assists = o["Assists"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] - o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] - o["Core_Bound_Likely"].Retiring = o["Retiring"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Assists = o["Assists"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] + o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + 
o["Core_Bound_Likely"].Retiring = o["Retiring"] + o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].LSD = o["LSD"] o["DSB_Misses"].MITE = o["MITE"] @@ -5969,6 +6130,12 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].LSD = o["LSD"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6045,5 +6212,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/icx_server_ratios.py b/icx_server_ratios.py index 5ac5e0c6..673718a6 100644 --- a/icx_server_ratios.py +++ b/icx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors 3rd gen (code name Icelake Server) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors 3rd gen (code name Icelake Server) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -47,6 +50,10 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +Memory = 1 +PMM_App_Direct = 1 if Memory == 1 else 0 +PERF_METRICS_MSR = 1 +DS = 1 # Aux. 
formulas @@ -58,7 +65,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -103,11 +110,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0xfc", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -129,19 +136,24 @@ def LOAD_LCL_MEM(self, EV, level): return EV("MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) def LOAD_LCL_PMM(self, EV, level): - return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) + return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_FWD(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_HITM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_MEM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_PMM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_XSNP_HIT(self, EV, level): return EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT", level) @@ -156,7 +168,7 @@ def MEM_Bound_Ratio(self, EV, level): return EV("CYCLE_ACTIVITY.STALLS_L3_MISS", level) / CLKS(self, EV, level) + L2_Bound_Ratio(self, EV, level) - self.L2_Bound.compute(EV) def Mem_DDR_Hit_Fraction(self, EV, level): - return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, 
EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) + return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) if DS else 1 def Mem_Lock_St_Fraction(self, EV, level): return EV("MEM_INST_RETIRED.LOCK_LOADS", level) / EV("MEM_INST_RETIRED.ALL_STORES", level) @@ -215,13 +227,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def Num_CPUs(self, EV, level): - return 160 if smt_enabled else 80 - -def Memory(self, EV, level): - return 1 + return num_cores * num_sockets * num_threads if num_cores else 160 /(2 - smt_enabled ) -def PMM_App_Direct(self, EV, level): - return 1 if Memory(self, EV, level)== 1 else 0 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -235,7 +245,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -243,23 +253,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + 
self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + 
self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val 
> 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -277,24 +287,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. 
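As a minimal sketch (not part of the generated files; every percentage below is hypothetical), the Bottlenecks-View bookkeeping behind the Other_Bottlenecks formula above and the renamed Useful_Work defined just below is simply "100 scaled slots minus everything that was named":

# Illustrative sketch only, outside the patch; values are made up.
# Mirrors Other_Bottlenecks = 100 - sum(named Bottlenecks View metrics),
# where the named set now includes Useful_Work (formerly Base_Non_Br).
def other_bottlenecks(named):
    return 100.0 - sum(named.values())

named_bv = {
    "Mispredictions": 12.0, "Big_Code": 3.0, "Instruction_Fetch_BW": 6.0,
    "Cache_Memory_Bandwidth": 9.0, "Cache_Memory_Latency": 14.0,
    "Memory_Data_TLBs": 2.0, "Memory_Synchronization": 1.0,
    "Compute_Bound_Est": 8.0, "Irregular_Overhead": 2.0,
    "Branching_Overhead": 4.0, "Useful_Work": 31.0,
}
print(other_bottlenecks(named_bv))  # 8.0 -> leftover, unattributed pipeline cost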
+def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -305,7 +321,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -321,7 +337,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -343,7 +359,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -351,12 +367,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -385,7 +395,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -401,37 +411,37 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. 
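A standalone rendering of the Core_Bound_Likely helper moved above may help when reading the tree output; this is a sketch with hypothetical level-2 fractions, not toplev code:

# Illustrative sketch only, outside the patch; inputs are made-up fractions.
# The heuristic fires only when both SMT threads were busy, and grows as the
# measured Ports_Utilization exceeds the per-thread Core_Bound fraction.
def core_bound_likely(core_bound, ports_utilization, smt_2t_utilization):
    if smt_2t_utilization <= 0.5:
        return 0.0
    frac = (1 - core_bound / ports_utilization
            if core_bound < ports_utilization else 1)
    return 100.0 * frac

print(core_bound_likely(0.10, 0.25, 0.8))  # 60.0 -> Core Bound likely hidden by SMT
print(core_bound_likely(0.10, 0.25, 0.3))  # 0.0  -> SMT barely used, no adjustment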
def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. 
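The "Values < 1" wording above can be made concrete with a back-of-the-envelope example (hypothetical counts, not from any measurement): each retired FMA bumps FP_ARITH_INST_RETIRED.* by two, so an FMA-dense kernel legitimately reports more "FP arithmetic instructions" than instructions:

# Illustrative arithmetic only, outside the patch; counts are hypothetical.
inst_retired_any = 1000            # all retired instructions
fma_512b_retired = 900             # 512-bit FMAs among them
fp_arith_512b_events = 2 * fma_512b_retired  # FMA counts twice in the event

ip_arith_avx512 = inst_retired_any / fp_arith_512b_events
print(round(ip_arith_avx512, 2))   # 0.56 -- below 1 by design, not an error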
def IpArith_AVX512(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -461,10 +471,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -485,6 +506,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) @@ -519,25 +546,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
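The new Info.Pipeline fetch-bandwidth metrics defined above are plain uops-per-active-cycle ratios; a sketch with hypothetical event totals (not measured data):

# Illustrative sketch only, outside the patch; event totals are hypothetical.
counts = {
    "LSD.UOPS": 4.0e9,      "LSD.CYCLES_ACTIVE":   0.8e9,
    "IDQ.DSB_UOPS": 9.0e9,  "IDQ.DSB_CYCLES_ANY":  1.8e9,
    "IDQ.MITE_UOPS": 1.2e9, "IDQ.MITE_CYCLES_ANY": 0.5e9,
}
fetch_lsd  = counts["LSD.UOPS"] / counts["LSD.CYCLES_ACTIVE"]         # 5.0
fetch_dsb  = counts["IDQ.DSB_UOPS"] / counts["IDQ.DSB_CYCLES_ANY"]    # 5.0
fetch_mite = counts["IDQ.MITE_UOPS"] / counts["IDQ.MITE_CYCLES_ANY"]  # 2.4
print(fetch_lsd, fetch_dsb, fetch_mite)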
def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -547,7 +574,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -599,6 +626,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all demand loads (including speculative) def L2HPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_HIT", level) / EV("INST_RETIRED.ANY", level) @@ -611,19 +642,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -695,11 +722,11 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -709,7 +736,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -761,7 +788,9 @@ def MEM_Parallel_Reads(self, EV, level): # Average latency of data read request to external 3D X-Point memory [in nanoseconds]. 
Accounts for demand loads and L1/L2 data-read prefetches def MEM_PMM_Read_Latency(self, EV, level): - return 0 + EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) + EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level) + return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) if PMM_App_Direct else 0 # Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_DRAM_Read_Latency(self, EV, level): @@ -769,11 +798,11 @@ def MEM_DRAM_Read_Latency(self, EV, level): # Average 3DXP Memory Bandwidth Use for reads [GB / sec] def PMM_Read_BW(self, EV, level): - return 0 + return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average 3DXP Memory Bandwidth Use for Writes [GB / sec] def PMM_Write_BW(self, EV, level): - return 0 + return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU def IO_Read_BW(self, EV, level): @@ -811,11 +840,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -873,7 +902,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -898,7 +927,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -955,7 +984,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -1003,7 +1032,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1280,7 +1309,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 
'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1311,7 +1340,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1335,7 +1364,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1365,7 +1394,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1389,11 +1418,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + (Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + (Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) + Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1481,8 +1510,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1513,7 +1542,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1537,7 +1566,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1582,13 +1611,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. 
The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1672,7 +1726,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1701,7 +1755,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1751,7 +1805,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1779,7 +1833,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1806,7 +1860,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1834,7 +1888,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1882,7 +1936,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1920,7 +1974,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1946,7 +2000,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -1977,7 +2031,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_MEM zero division") @@ -2001,7 +2056,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - 
Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -2026,7 +2083,10 @@ class PMM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = 0 + self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0) if PMM_App_Direct else 0 + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3) + EV("MEM_LOAD_RETIRED.L1_MISS", 3) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "PMM_Bound zero division") @@ -2075,7 +2135,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2103,7 +2163,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2158,7 +2218,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -2182,7 +2243,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2290,8 +2351,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2316,7 +2377,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2404,7 +2465,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2515,7 +2576,7 
@@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2704,11 +2765,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2976,7 +3037,7 @@ class Branch_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3023,7 +3084,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3129,7 +3190,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3220,7 +3281,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3240,7 +3301,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3261,7 +3322,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3272,7 +3333,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3281,7 +3343,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3301,7 +3363,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3321,7 +3383,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3341,7 +3403,7 @@ class 
Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3351,7 +3413,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3362,7 +3424,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3383,7 +3445,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3405,7 +3467,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3415,10 +3477,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3427,7 +3488,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3437,31 +3498,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). 
Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3516,7 +3600,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3697,7 +3781,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3719,26 +3804,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3855,7 +3920,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3917,8 +3982,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3938,8 +4004,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). 
May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3959,8 +4026,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3980,8 +4048,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -4001,8 +4069,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -4022,8 +4090,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4129,7 +4197,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4143,8 +4211,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from 
MITE per cycle""" class Metric_Fetch_UpC: @@ -4230,6 +4354,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4385,7 +4529,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4405,7 +4549,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4425,7 +4569,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4445,7 +4589,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4486,7 +4630,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4727,6 +4871,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_Load: name = "L2HPKI_Load" domain = "Metric" @@ -4804,8 +4968,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4824,8 +4987,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4844,8 +5006,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class 
Metric_L3_Cache_Access_BW: @@ -4864,8 +5025,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5189,7 +5349,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5208,7 +5368,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5281,7 +5441,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5706,6 +5866,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5795,6 +5956,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5869,7 +6031,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5883,7 +6046,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5904,10 +6066,14 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = 
n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5933,6 +6099,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_FB_HPKI() ; r.metric(n) ; o["FB_HPKI"] = n @@ -6015,20 +6182,14 @@ def __init__(self, r): o["Load_STLB_Hit"].DTLB_Load = o["DTLB_Load"] o["DRAM_Bound"].L2_Bound = o["L2_Bound"] o["MEM_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["PMM_Bound"].L2_Bound = o["L2_Bound"] o["Store_STLB_Hit"].DTLB_Store = o["DTLB_Store"] o["Store_STLB_Hit"].Store_STLB_Miss = o["Store_STLB_Miss"] o["Core_Bound"].Memory_Bound = o["Memory_Bound"] o["Core_Bound"].Retiring = o["Retiring"] o["Core_Bound"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -6117,45 +6278,53 @@ def __init__(self, r): o["Instruction_Fetch_BW"].MS_Switches = o["MS_Switches"] o["Instruction_Fetch_BW"].Unknown_Branches = o["Unknown_Branches"] o["Cache_Memory_Bandwidth"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].G4K_Aliasing = o["G4K_Aliasing"] o["Cache_Memory_Bandwidth"].Retiring = o["Retiring"] o["Cache_Memory_Bandwidth"].PMM_Bound = o["PMM_Bound"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] - o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] - o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] - 
o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] o["Cache_Memory_Latency"].PMM_Bound = o["PMM_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] - o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] + o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] @@ -6163,15 +6332,16 @@ def __init__(self, r): o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] o["Memory_Data_TLBs"].Retiring = o["Retiring"] o["Memory_Data_TLBs"].PMM_Bound = o["PMM_Bound"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Store_Bound = o["Store_Bound"] o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].Lock_Latency = 
o["Lock_Latency"] @@ -6239,20 +6409,12 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] - o["Other_Bottlenecks"].PMM_Bound = o["PMM_Bound"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -6260,52 +6422,61 @@ def __init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].PMM_Bound = o["PMM_Bound"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] o["Other_Bottlenecks"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] - 
o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] + o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Assists = o["Assists"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] - o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] - o["Core_Bound_Likely"].Retiring = o["Retiring"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Assists = o["Assists"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] + o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["Core_Bound_Likely"].Retiring = o["Retiring"] + o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].MITE = o["MITE"] o["DSB_Misses"].LCP = o["LCP"] @@ -6319,6 +6490,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6396,5 +6572,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = 
(o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/ivb_client_ratios.py b/ivb_client_ratios.py index b1adfcfc..495dc1f6 100644 --- a/ivb_client_ratios.py +++ b/ivb_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 3rd gen Core (code named IvyBridge) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 3rd gen Core (code named IvyBridge) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 15.6 +EBS_Mode = 0 +DS = 0 # Aux. formulas @@ -191,7 +196,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -221,13 +226,13 @@ def CoreIPC(self, EV, level): def FLOPc(self, EV, level): return FLOP_Count(self, EV, level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -253,7 +258,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -263,7 +268,7 @@ def IpTB(self, EV, level): def BpTkBranch(self, EV, level): return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. 
def IpArith(self, EV, level): val = 1 /(self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV)) self.thresh = (val < 10) @@ -277,7 +282,6 @@ def Instructions(self, EV, level): def Retire(self, EV, level): return Retired_Slots(self, EV, level) / EV("UOPS_RETIRED.RETIRE_SLOTS:c1", level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) @@ -297,7 +301,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -319,19 +323,20 @@ def L1MPKI(self, EV, level): def L2MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L2_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L3 cache true misses per kilo instruction for retired demand loads def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -367,17 +372,17 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -435,7 +440,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -497,7 +502,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -522,7 +527,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -784,7 +789,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -815,7 +820,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -845,7 +850,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -937,8 +942,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -1080,7 +1085,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1109,7 +1114,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1159,7 +1164,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1187,7 +1192,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1214,7 +1219,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ 
-1242,7 +1247,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1290,7 +1295,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1328,7 +1333,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1384,7 +1389,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1412,7 +1417,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1463,7 +1468,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1523,8 +1528,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -1671,7 +1676,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -1905,7 +1910,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2182,7 +2187,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2288,7 +2293,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2427,7 +2432,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2546,7 +2552,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2586,8 +2592,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). 
Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_Instructions: @@ -2631,7 +2638,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -2645,8 +2652,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" class Metric_DSB_Coverage: @@ -2729,7 +2735,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -2814,6 +2820,26 @@ def compute(self, EV): loads""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L3MPKI: name = "L3MPKI" domain = "Metric" @@ -2850,8 +2876,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -2870,8 +2895,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -2890,8 +2914,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -3034,7 +3057,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -3053,7 +3076,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3107,7 +3130,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -3456,6 +3479,7 @@ def __init__(self, r): n = Metric_MLP() ; r.metric(n) ; o["MLP"] = n n = Metric_L1MPKI() ; r.metric(n) ; o["L1MPKI"] = n n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_L1D_Cache_Fill_BW() ; r.metric(n) ; o["L1D_Cache_Fill_BW"] = n n = Metric_L2_Cache_Fill_BW() ; r.metric(n) ; o["L2_Cache_Fill_BW"] = n diff --git a/ivb_server_ratios.py b/ivb_server_ratios.py index 0351fa1f..a321a9b3 100644 --- a/ivb_server_ratios.py +++ b/ivb_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon E5 v2 (code named IvyBridge EP) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon E5 v2 (code named IvyBridge EP) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 15.6 +EBS_Mode = 0 +DS = 1 # Aux. formulas @@ -215,7 +220,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -245,13 +250,13 @@ def CoreIPC(self, EV, level): def FLOPc(self, EV, level): return FLOP_Count(self, EV, level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -277,7 +282,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -287,7 +292,7 @@ def IpTB(self, EV, level): def BpTkBranch(self, EV, level): return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = 1 /(self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV)) self.thresh = (val < 10) @@ -301,7 +306,6 @@ def Instructions(self, EV, level): def Retire(self, EV, level): return Retired_Slots(self, EV, level) / EV("UOPS_RETIRED.RETIRE_SLOTS:c1", level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) @@ -321,7 +325,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -343,19 +347,20 @@ def L1MPKI(self, EV, level): def L2MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L2_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L3 cache true misses per kilo instruction for retired demand loads def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -391,11 +396,11 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -405,7 +410,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -467,7 +472,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -529,7 +534,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -554,7 +559,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -816,7 +821,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -847,7 +852,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -877,7 +882,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -969,8 +974,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -1112,7 +1117,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1141,7 +1146,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1191,7 +1196,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1219,7 +1224,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1246,7 +1251,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ 
-1274,7 +1279,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1322,7 +1327,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1360,7 +1365,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1489,7 +1494,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1517,7 +1522,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1568,7 +1573,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1628,8 +1633,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -1776,7 +1781,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2010,7 +2015,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2287,7 +2292,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2393,7 +2398,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2532,7 +2537,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2651,7 +2657,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2691,8 +2697,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). 
May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_Instructions: @@ -2736,7 +2743,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -2750,8 +2757,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" class Metric_DSB_Coverage: @@ -2834,7 +2840,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -2919,6 +2925,26 @@ def compute(self, EV): loads""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L3MPKI: name = "L3MPKI" domain = "Metric" @@ -2955,8 +2981,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -2975,8 +3000,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -2995,8 +3019,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -3139,7 +3162,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -3158,7 +3181,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3231,7 +3254,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -3608,6 +3631,7 @@ def __init__(self, r): n = Metric_MLP() ; r.metric(n) ; o["MLP"] = n n = Metric_L1MPKI() ; r.metric(n) ; o["L1MPKI"] = n n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_L1D_Cache_Fill_BW() ; r.metric(n) ; o["L1D_Cache_Fill_BW"] = n n = Metric_L2_Cache_Fill_BW() ; r.metric(n) ; o["L2_Cache_Fill_BW"] = n diff --git a/jkt_server_ratios.py b/jkt_server_ratios.py index c78712b5..ca9197a2 100644 --- a/jkt_server_ratios.py +++ b/jkt_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon E5 (code named SandyBridge EP) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon E5 (code named SandyBridge EP) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -45,6 +48,8 @@ def handle_error_metric(obj, msg): Pipeline_Width = 4 OneMillion = 1000000 OneBillion = 1000000000 +EBS_Mode = 0 +DS = 1 # Aux. formulas @@ -161,13 +166,13 @@ def CoreIPC(self, EV, level): def FLOPc(self, EV, level): return FLOP_Count(self, EV, level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_DISPATCHED.THREAD", level) / Execute_Cycles(self, EV, level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Total number of retired Instructions def Instructions(self, EV, level): @@ -185,11 +190,11 @@ def DSB_Coverage(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -199,7 +204,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -261,7 +266,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -323,7 +328,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -527,7 +532,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -558,7 +563,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -588,7 +593,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -650,8 +655,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -730,7 +735,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -768,7 +773,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -855,8 +860,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -914,7 +919,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1356,7 +1361,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -1444,7 +1450,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -1463,7 +1469,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 
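The corrected System metrics above now derive CPUs_Utilized directly from CPU_CLK_UNHALTED.REF_TSC over the msr/tsc/ reference, and CPU_Utilization as that count divided by Num_CPUs (hence the swapped maxval limits). A minimal standalone sketch of that relationship, with made-up counter values; treating REF_TSC as the sum over all measured CPUs while the tsc reference reflects one CPU's elapsed ticks is an assumption made purely for illustration:

# Minimal sketch (not toplev code): corrected Info.System relationship.
NUM_CPUS = 16                 # Num_CPUs(): logical processors (assumed value)
TSC_HZ = 2.4e9                # msr/tsc/ advances at TSC rate on every CPU

ref_tsc_sum = 12 * TSC_HZ     # CPU_CLK_UNHALTED.REF_TSC, assumed summed over all CPUs:
                              # here 12 CPUs were unhalted for the whole 1-second window
tsc_elapsed = 1.0 * TSC_HZ    # msr/tsc/ ticks elapsed in the same window

cpus_utilized = ref_tsc_sum / tsc_elapsed      # new CPUs_Utilized  -> 12.0 (maxval 300)
cpu_utilization = cpus_utilized / NUM_CPUS     # new CPU_Utilization -> 0.75 (maxval 1)

assert abs(cpus_utilized - 12.0) < 1e-9
assert abs(cpu_utilization - 0.75) < 1e-9
print(f"CPUs_Utilized={cpus_utilized:.1f}  CPU_Utilization={cpu_utilization:.2f}")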
+ maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -1536,7 +1542,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: diff --git a/skl_client_ratios.py b/skl_client_ratios.py index 774e13d3..61bbdf65 100644 --- a/skl_client_ratios.py +++ b/skl_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 6th/7th gen Core (code named Skykale/Kabylake/Coffeelake) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 6th/7th gen Core (code named Skykale/Kabylake/Coffeelake) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "SKL091" +EBS_Mode = 0 +DS = 0 # Aux. formulas @@ -59,7 +64,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - (EV("BR_INST_RETIRED.COND", level) - EV("BR_INST_RETIRED.NOT_TAKEN", level)) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -113,11 +118,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -200,6 +205,10 @@ def Retired_Slots(self, EV, level): def Num_CPUs(self, EV, level): return 8 if smt_enabled else 4 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 + # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): val = 100 *(1 - Umisp(self, EV, level)) * (self.Branch_Mispredicts.compute(EV) + self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV))) @@ -212,7 +221,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -220,23 +229,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + 
self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return 
val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -254,24 +263,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. 
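The Bottlenecks-View formulas above (Memory_Data_TLBs, Memory_Synchronization, and the others) all follow the same pattern: a tree leaf is weighted by its share of each parent level and reported as a percentage of pipeline slots, and the Other_Bottlenecks definition that follows closes the accounting to 100. A minimal numeric sketch of the load-side term of Memory_Data_TLBs only, with made-up node values (the store-side term is accumulated the same way):

# Minimal sketch (not toplev code) of the Bottlenecks-View weighting pattern.
memory_bound = 0.40                          # level-2 node, fraction of slots
l1_bound, l2_bound, l3_bound = 0.10, 0.05, 0.15
dram_bound, store_bound = 0.06, 0.04         # level-3 siblings under Memory_Bound
dtlb_load = 0.03                             # level-4 leaf under L1_Bound (fraction of clocks)
l1_leaves_sum = 0.08                         # sum of all L1_Bound child estimates (assumed)

l3_siblings_sum = l1_bound + l2_bound + l3_bound + dram_bound + store_bound
memory_data_tlbs_load_part = 100 * (
    memory_bound
    * (l1_bound / max(memory_bound, l3_siblings_sum))   # L1_Bound's share of Memory_Bound
    * (dtlb_load / max(l1_bound, l1_leaves_sum)))        # DTLB_Load's share within L1_Bound
print(f"load-side Memory_Data_TLBs ~= {memory_data_tlbs_load_part:.1f} scaled-slots %")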
+# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -282,7 +297,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -316,7 +331,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -324,15 +339,9 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -358,7 +367,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -374,31 +383,31 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
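The reworded descriptions make the FMA behavior explicit: the FP_ARITH_INST_RETIRED events intentionally count each FMA as two operations, so an instructions-per-operation ratio such as IpArith can legitimately fall below 1. A small worked example with a made-up instruction mix:

# Minimal worked example (made-up counts): why IpArith-style ratios can be < 1.
inst_retired_any = 1000                      # retired instructions in the window
fma_instructions = 800                       # scalar double-precision FMAs among them
scalar_double_ops = 2 * fma_instructions     # event counts 2 per FMA -> 1600

ip_arith_scalar_dp = inst_retired_any / scalar_double_ops
print(f"IpArith_Scalar_DP = {ip_arith_scalar_dp:.3f}")   # 0.625, i.e. below 1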
+# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -428,10 +437,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -456,6 +476,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
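The new Info.Pipeline helpers above each report uops delivered per cycle in which that fetch unit was active; the DSB_Bandwidth bottleneck defined next then weighs the DSB share of Fetch_Bandwidth. A minimal standalone sketch of the per-unit calculation, using made-up raw counts rather than a real perf session:

# Minimal sketch (not toplev code): uops per active cycle for each fetch unit,
# mirroring the Fetch_LSD / Fetch_DSB / Fetch_MITE helpers above.
counts = {
    "LSD.UOPS": 1.2e9,  "LSD.CYCLES_ACTIVE": 0.3e9,
    "IDQ.DSB_UOPS": 3.6e9, "IDQ.DSB_CYCLES_ANY": 0.8e9,
    "IDQ.MITE_UOPS": 0.9e9, "IDQ.MITE_CYCLES": 0.4e9,
}

def per_cycle(uops_event, cycles_event):
    # Guard against an idle unit, analogous to the ZeroDivisionError handling in the model
    cycles = counts[cycles_event]
    return counts[uops_event] / cycles if cycles else 0.0

print("Fetch_LSD :", per_cycle("LSD.UOPS", "LSD.CYCLES_ACTIVE"))      # ~4.0
print("Fetch_DSB :", per_cycle("IDQ.DSB_UOPS", "IDQ.DSB_CYCLES_ANY")) # ~4.5
print("Fetch_MITE:", per_cycle("IDQ.MITE_UOPS", "IDQ.MITE_CYCLES"))   # ~2.25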
+def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.LSD.compute(EV) + self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) + 2 @@ -490,7 +516,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -500,7 +526,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -548,6 +574,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -564,19 +594,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -632,17 +658,17 @@ def UC_Load_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) 
/ EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -708,7 +734,7 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -770,7 +796,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -795,7 +821,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -852,7 +878,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -900,7 +926,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1181,7 +1207,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1212,7 +1238,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1236,7 +1262,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1266,7 +1292,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1290,7 +1316,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1382,8 +1408,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + 
maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1414,7 +1440,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1438,7 +1464,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1483,13 +1509,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1573,7 +1624,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1602,7 +1653,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1652,7 +1703,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1680,7 +1731,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1707,7 +1758,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1735,7 +1786,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = 
frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1783,7 +1834,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1821,7 +1872,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1877,7 +1928,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1905,7 +1956,7 @@ class False_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1956,7 +2007,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2064,8 +2115,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2090,7 +2141,7 @@ class Serializing_Operation: sample = ['PARTIAL_RAT_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2178,7 +2229,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2289,7 +2340,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2570,7 +2621,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2817,7 +2868,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2829,8 +2880,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -2843,7 +2894,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2893,7 +2944,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2998,7 +3049,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3089,7 +3140,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3109,7 +3160,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3130,7 +3181,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3141,7 +3192,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3150,7 +3202,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3170,7 +3222,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3190,7 +3242,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3210,7 +3262,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3220,7 +3272,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3231,7 +3283,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3252,7 +3304,7 @@ class 
Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3274,7 +3326,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3284,10 +3336,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3296,7 +3347,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3306,31 +3357,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. 
Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3385,7 +3459,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3546,7 +3620,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3568,26 +3643,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3704,7 +3759,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3766,8 +3821,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3787,8 +3843,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3808,8 +3865,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3829,8 +3887,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -3850,8 +3908,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -3957,7 +4015,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -3971,8 +4029,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4078,6 +4192,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4233,7 +4367,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4274,7 +4408,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4495,6 +4629,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache 
miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4592,8 +4746,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4612,8 +4765,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4632,8 +4784,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4652,8 +4803,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -4898,7 +5048,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -4917,7 +5067,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -4971,7 +5121,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5228,6 +5378,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5316,6 +5467,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5389,7 +5541,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5402,7 +5555,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; 
r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5422,11 +5574,15 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_LSD_Coverage() ; r.metric(n) ; o["LSD_Coverage"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5448,6 +5604,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5512,16 +5669,7 @@ def __init__(self, r): o["Core_Bound"].Retiring = o["Retiring"] o["Core_Bound"].Frontend_Bound = o["Frontend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilization"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -5610,6 +5758,7 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] @@ -5617,27 +5766,34 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] 
o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Latency"].Retiring = o["Retiring"] o["Cache_Memory_Latency"].Frontend_Bound = o["Frontend_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] @@ -5645,6 +5801,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] o["Memory_Data_TLBs"].Frontend_Bound = o["Frontend_Bound"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -5715,19 +5872,12 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -5735,43 
+5885,51 @@ def __init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] o["Other_Bottlenecks"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] - o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - 
o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] @@ -5792,6 +5950,12 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].LSD = o["LSD"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -5865,5 +6029,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/skx_server_ratios.py b/skx_server_ratios.py index a417876a..8137f5d0 100644 --- a/skx_server_ratios.py +++ b/skx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors (code named Skylake Server) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors (code named Skylake Server) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "SKL091" +EBS_Mode = 0 +DS = 1 # Aux. 
formulas @@ -59,7 +64,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - (EV("BR_INST_RETIRED.COND", level) - EV("BR_INST_RETIRED.NOT_TAKEN", level)) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -113,7 +118,7 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): @@ -218,6 +223,10 @@ def Retired_Slots(self, EV, level): def Num_CPUs(self, EV, level): return 8 if smt_enabled else 4 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 + # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): val = 100 *(1 - Umisp(self, EV, level)) * (self.Branch_Mispredicts.compute(EV) + self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV))) @@ -230,7 +239,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -238,25 +247,25 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + 
self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + 
(self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across 
processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * ((self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV))) * self.Remote_Cache.compute(EV) / (self.Remote_Cache.compute(EV) + self.Remote_MEM.compute(EV) + self.Local_MEM.compute(EV)) + (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) + val = 100 *(self.Memory_Bound.compute(EV) * ((self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV))) * self.Remote_Cache.compute(EV) / (self.Remote_Cache.compute(EV) + self.Remote_MEM.compute(EV) + self.Local_MEM.compute(EV)) + (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) if DS else 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + 
self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) return val @@ -272,24 +281,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -300,7 +315,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -334,7 +349,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -342,15 +357,9 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -376,7 +385,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -392,37 +401,37 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
+# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX512(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -448,10 +457,17 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -472,6 +488,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
+def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) + 2 @@ -506,7 +528,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -516,7 +538,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -564,6 +586,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -580,19 +606,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -656,11 +678,11 @@ def UC_Load_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # 
Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -670,7 +692,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -760,7 +782,7 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -822,7 +844,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -847,7 +869,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -904,7 +926,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -952,7 +974,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1206,7 +1228,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1237,7 +1259,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1261,7 +1283,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1291,7 +1313,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1315,7 +1337,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1407,8 +1429,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = 
frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1439,7 +1461,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1463,7 +1485,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1508,13 +1530,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1598,7 +1645,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1627,7 +1674,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1677,7 +1724,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1705,7 +1752,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1732,7 +1779,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1760,7 +1807,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 
'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1808,7 +1855,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1846,7 +1893,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1872,7 +1919,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -1903,7 +1950,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_MEM zero division") @@ -1927,7 +1975,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -1975,7 +2025,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2003,11 +2053,11 @@ class False_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_miss_remote_hitm(self, EV, 4) + Mem_XSNP_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_hit_snoop_hitm(self, EV, 4)) / CLKS(self, EV, 4) + self.val = (Mem_Remote_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_miss_remote_hitm(self, EV, 4) + Mem_XSNP_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_hit_snoop_hitm(self, EV, 4)) / CLKS(self, EV, 4) if DS else Mem_XSNP_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_hit_snoop_hitm(self, EV, 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, 
"False_Sharing zero division") @@ -2054,7 +2104,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2162,8 +2212,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2188,7 +2238,7 @@ class Serializing_Operation: sample = ['PARTIAL_RAT_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2253,7 +2303,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2364,7 +2414,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2645,7 +2695,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2916,7 +2966,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2928,8 +2978,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -2942,7 +2992,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2992,7 +3042,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3097,7 +3147,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3188,7 +3238,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3208,7 +3258,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3229,7 +3279,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3240,7 +3290,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3249,7 +3300,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3269,7 +3320,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3289,7 +3340,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3309,7 +3360,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3319,7 +3370,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3330,7 +3381,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3351,7 +3402,7 @@ class 
Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3373,7 +3424,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3383,10 +3434,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3395,7 +3445,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3405,31 +3455,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. 
Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3484,7 +3557,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3645,7 +3718,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3667,26 +3741,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3803,7 +3857,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3865,8 +3919,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3886,8 +3941,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3907,8 +3963,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3928,8 +3985,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -3949,8 +4006,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -3970,8 +4027,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpSWPF: @@ -4057,7 +4114,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4071,8 +4128,45 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4158,6 +4252,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4313,7 +4427,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4354,7 +4468,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4575,6 +4689,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo 
instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4672,8 +4806,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4692,8 +4825,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4712,8 +4844,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4732,8 +4863,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5019,7 +5149,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5038,7 +5168,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5111,7 +5241,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5476,6 +5606,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5566,6 +5697,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5642,7 +5774,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5655,7 +5788,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; 
o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5675,10 +5807,13 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5700,6 +5835,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5772,16 +5908,7 @@ def __init__(self, r): o["Core_Bound"].Retiring = o["Retiring"] o["Core_Bound"].Frontend_Bound = o["Frontend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilization"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -5870,6 +5997,7 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] @@ -5877,27 +6005,34 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + 
o["Cache_Memory_Latency"].Retiring = o["Retiring"] o["Cache_Memory_Latency"].Frontend_Bound = o["Frontend_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] @@ -5905,6 +6040,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] o["Memory_Data_TLBs"].Frontend_Bound = o["Frontend_Bound"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -5980,20 +6116,13 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] o["Other_Bottlenecks"].Local_MEM = o["Local_MEM"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -6001,6 +6130,25 @@ def __init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = 
o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] @@ -6009,37 +6157,26 @@ def __init__(self, r): o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] o["Other_Bottlenecks"].Remote_MEM = o["Remote_MEM"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] - o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = 
o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] @@ -6059,6 +6196,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6134,5 +6276,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/snb_client_ratios.py b/snb_client_ratios.py index 6c6c4cbf..a78c893b 100644 --- a/snb_client_ratios.py +++ b/snb_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 2nd gen Core (code named SandyBridge) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 2nd gen Core (code named SandyBridge) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -45,6 +48,8 @@ def handle_error_metric(obj, msg): Pipeline_Width = 4 OneMillion = 1000000 OneBillion = 1000000000 +EBS_Mode = 0 +DS = 0 # Aux. 
formulas @@ -161,13 +166,13 @@ def CoreIPC(self, EV, level): def FLOPc(self, EV, level): return FLOP_Count(self, EV, level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_DISPATCHED.THREAD", level) / Execute_Cycles(self, EV, level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Total number of retired Instructions def Instructions(self, EV, level): @@ -185,17 +190,17 @@ def DSB_Coverage(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -249,7 +254,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -311,7 +316,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -515,7 +520,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -546,7 +551,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -576,7 +581,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -638,8 +643,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -718,7 +723,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -756,7 +761,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -843,8 +848,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -902,7 +907,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1344,7 +1349,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -1432,7 +1438,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -1451,7 +1457,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 
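# --- Editor's illustrative sketch (hypothetical helper, not part of the patch or the generated files) ---
# The hunk above swaps the derivation of the two Info.System metrics: CPUs_Utilized
# is now the primary count (unhalted reference cycles over TSC) and CPU_Utilization
# is that count normalized by the number of logical CPUs, which is why maxval moves
# to 1 for the fraction and to a large count (300) for CPUs_Utilized. Names and
# numbers below are made up for illustration only.
def sketch_system_utilization(ref_tsc_cycles, tsc_cycles, num_cpus):
    cpus_utilized = ref_tsc_cycles / tsc_cycles   # average number of CPUs kept busy
    cpu_utilization = cpus_utilized / num_cpus    # fraction in [0, 1]
    return cpus_utilized, cpu_utilization

# Example: 8e9 aggregated unhalted REF_TSC cycles against 1e9 TSC ticks on a
# 16-CPU system gives CPUs_Utilized = 8.0 and CPU_Utilization = 0.5.
assert sketch_system_utilization(8e9, 1e9, 16) == (8.0, 0.5)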
+ maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -1505,7 +1511,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: diff --git a/spr_max_server_ratios.py b/spr_max_server_ratios.py index ef9a25be..8687f88b 100644 --- a/spr_max_server_ratios.py +++ b/spr_max_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors 4th gen MAX (code name Sapphire Rapids) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors 4th gen MAX (code name Sapphire Rapids) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 1 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -46,7 +49,13 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 -Errata_Whitelist = "ADL038" +Errata_Whitelist = "ADL038;ADL066" +Memory = 2 +PMM_App_Direct = 1 if Memory == 1 else 0 +HBM = 1 if Memory > 1 else 0 +PERF_METRICS_MSR = 1 +FP16 = 1 +DS = 1 # Aux. formulas @@ -55,7 +64,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + EV("RS.EMPTY:u1", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.AMX_Busy.compute(EV) + self.Divider.compute(EV)) @@ -100,11 +109,15 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + EV("FP_ARITH_INST_RETIRED.SCALAR", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) if FP16 else EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + EV("FP_ARITH_INST_RETIRED.VECTOR", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) if FP16 else EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -120,19 +133,24 @@ def LOAD_LCL_MEM(self, EV, level): return EV("MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) def LOAD_LCL_PMM(self, EV, level): - return 
EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) + return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_FWD(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_HITM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_MEM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_PMM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_XSNP_HIT(self, EV, level): return EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", level) + EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", level) * (1 - True_XSNP_HitM_Fraction(self, EV, level)) @@ -147,7 +165,7 @@ def MEM_Bound_Ratio(self, EV, level): return EV("MEMORY_ACTIVITY.STALLS_L3_MISS", level) / CLKS(self, EV, level) def Mem_DDR_Hit_Fraction(self, EV, level): - return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) + return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) if DS else 1 def Mem_Lock_St_Fraction(self, EV, level): return EV("MEM_INST_RETIRED.LOCK_LOADS", level) / EV("MEM_INST_RETIRED.ALL_STORES", level) @@ -180,16 +198,16 @@ def Mem_XSNP_None_Cost(self, EV, level): return 12 * Core_Frequency(self, EV, level) def Mem_Local_DRAM_Cost(self, EV, level): - return 108 * Core_Frequency(self, EV, level) + return 109 * Core_Frequency(self, EV, level) def Mem_Remote_DRAM_Cost(self, EV, level): - return 186 * Core_Frequency(self, EV, level) + return 190 * Core_Frequency(self, EV, level) def Mem_Remote_HitM_Cost(self, EV, level): - return 172.5 * Core_Frequency(self, EV, level) + return 170 * Core_Frequency(self, EV, level) def Mem_Remote_Fwd_Cost(self, EV, level): - return 172.5 * Core_Frequency(self, EV, level) + return 170 * Core_Frequency(self, EV, level) def Mem_L2_Hit_Cost(self, EV, level): return 3 * Core_Frequency(self, EV, level) @@ -206,16 +224,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def 
Num_CPUs(self, EV, level): - return 24 if smt_enabled else 16 - -def Memory(self, EV, level): - return 3 if HBM_Only(self, EV, level) else 2 - -def PMM_App_Direct(self, EV, level): - return 1 if Memory(self, EV, level)== 1 else 0 + return num_cores * num_threads if num_cores else(8 + 16 /(2 - smt_enabled)) -def HBM_Only(self, EV, level): - return 0 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -229,7 +242,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -237,23 +250,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))))) if HBM_Only(self, EV, level) else(100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.HBM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + 
self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))))) + val = (100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.HBM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = (100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * 
(self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) if HBM_Only(self, EV, level) else 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.HBM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.HBM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.MEM_Latency.compute(EV) / 
(self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related 
bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -271,24 +284,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. 
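# --- Editor's illustrative sketch (hypothetical helper, not part of the patch or the generated files) ---
# Per the formulas in this hunk, Branching_Overhead scales the retired branching
# work to percent of slots, and the renamed Useful_Work is the Retiring share left
# after subtracting that branching work and the assist work. Inputs below are
# hypothetical fractions of SLOTS.
def sketch_retiring_breakdown(retiring, branching_retired, assist_retired):
    branching_overhead = 100 * branching_retired
    useful_work = 100 * (retiring - branching_retired - assist_retired)
    return branching_overhead, useful_work

# Example: Retiring = 0.60 of slots, of which 0.08 is control-flow work
# (branches, calls, NOP padding) and 0.01 is assist overhead:
#   Branching_Overhead ~= 8, Useful_Work ~= 51 (both in Scaled_Slots).
print(sketch_retiring_breakdown(0.60, 0.08, 0.01))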
+def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -299,7 +318,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -315,7 +334,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -337,7 +356,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (EV("FP_ARITH_DISPATCHED.PORT_0", level) + EV("FP_ARITH_DISPATCHED.PORT_1", level) + EV("FP_ARITH_DISPATCHED.PORT_5", level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -345,12 +364,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -375,11 +388,11 @@ def IpBranch(self, EV, level): # Instructions per (near) call (lower number means higher occurrence rate) def IpCall(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("CPU_CLK_UNHALTED.NEAR_CALL", level) + val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_CALL", level) self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -395,51 +408,49 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). 
May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_HP(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + EV("INST_RETIRED.ANY", level) + val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED2.SCALAR", level) if FP16 else 0 self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level)) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("INST_RETIRED.ANY", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). 
Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level)) - self.thresh = (val < 10) - return val - -# Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions. -def IpArith_AMX_F16(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("AMX_OPS_RETIRED.BF16", level) - self.thresh = (val < 10) - return val - -# Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions. -def IpArith_AMX_Int8(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("AMX_OPS_RETIRED.INT8", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("INST_RETIRED.ANY", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val @@ -473,10 +484,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -501,6 +523,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_DATA.STALLS", level) / EV("ICACHE_DATA.STALLS:c1:e1", level) @@ -535,25 +563,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). 
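# Editorial aside: a hedged, standalone illustration of the IpMisp_* family that
# follows. Each metric divides INST_RETIRED.ANY by one class of retired
# mispredicts, so a lower value means that branch class mispredicts more often
# per instruction. The counter values below are hypothetical, not measured data.
def ip_misp_sketch():
    inst_retired_any = 1_000_000
    retired_mispredicts = {
        "COND_NTAKEN": 900,   # conditional non-taken branches
        "COND_TAKEN": 2_400,  # conditional taken branches
        "RET": 120,           # return branches
        "INDIRECT": 300,      # indirect calls/jumps
    }
    # e.g. IpMisp_Cond_Taken ~= 1_000_000 / 2_400 ~= 417 instructions per mispredict
    return {name: inst_retired_any / count
            for name, count in retired_mispredicts.items()}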
def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -563,7 +591,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -615,6 +643,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -631,19 +663,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -716,7 +744,7 @@ def Offcore_Read_L3M_PKI(self, EV, level): # High-Bandwidth Memory (HBM) accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches) def 
Offcore_Read_HBM_PKI(self, EV, level): EV("OCR.DEMAND_DATA_RD.PMM", level) - return Offcore_Read_L3M_PKI(self, EV, level) if Memory(self, EV, level)> 2 else 1000 * EV("OCR.DEMAND_DATA_RD.PMM", level) / Instructions(self, EV, level) + return Offcore_Read_L3M_PKI(self, EV, level) if Memory > 2 else 1000 * EV("OCR.DEMAND_DATA_RD.PMM", level) / Instructions(self, EV, level) # Off-core accesses per kilo instruction for modified write requests def Offcore_MWrite_Any_PKI(self, EV, level): @@ -732,11 +760,11 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -746,18 +774,10 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) -# Giga Integer (matrix) Operations Per Second -def GIOPs(self, EV, level): - return 8 * EV("AMX_OPS_RETIRED.INT8", level) / 1e9 / Time(self, EV, level) - -# Tera Integer (matrix) Operations Per Second -def TIOPs(self, EV, level): - return GIOPs(self, EV, level) / 1000 - # Average Frequency Utilization relative nominal frequency def Turbo_Utilization(self, EV, level): return CLKS(self, EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) @@ -801,7 +821,7 @@ def R2C_DRAM_BW(self, EV, level): # Average HBM BW for Reads-to-Core. See R2C_Offcore_BW. def R2C_HBM_BW(self, EV, level): EV("OCR.DEMAND_DATA_RD.PMM", level) - return R2C_DRAM_BW(self, EV, level) if Memory(self, EV, level)> 2 else 64 * EV("OCR.DEMAND_DATA_RD.PMM", level) / 1e9 / Time(self, EV, level) + return R2C_DRAM_BW(self, EV, level) if Memory > 2 else 64 * EV("OCR.DEMAND_DATA_RD.PMM", level) / 1e9 / Time(self, EV, level) # Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. memory-controller only def MEM_Read_Latency(self, EV, level): @@ -813,7 +833,9 @@ def MEM_Parallel_Reads(self, EV, level): # Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_PMM_Read_Latency(self, EV, level): - return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) + EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) + EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level) + return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) if PMM_App_Direct else 0 # Average latency of data read request to external DRAM memory [in nanoseconds]. 
Accounts for demand loads and L1/L2 data-read prefetches def MEM_DRAM_Read_Latency(self, EV, level): @@ -821,11 +843,11 @@ def MEM_DRAM_Read_Latency(self, EV, level): # Average 3DXP Memory Bandwidth Use for reads [GB / sec] def PMM_Read_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average 3DXP Memory Bandwidth Use for Writes [GB / sec] def PMM_Write_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU def IO_Read_BW(self, EV, level): @@ -867,11 +889,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_BUBBLES.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -906,7 +928,7 @@ class Fetch_Latency: maxval = None def compute(self, EV): try: - self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) + self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) if topdown_use_fixed else(EV("IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", 2) * Pipeline_Width - EV("INT_MISC.UOP_DROPPING", 2)) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Fetch_Latency zero division") @@ -929,7 +951,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -954,7 +976,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -1011,7 +1033,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -1059,7 +1081,7 @@ class Unknown_Branches: sample = ['FRONTEND_RETIRED.UNKNOWN_BRANCH'] errcount = 0 sibling = None - metricgroup = 
frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1082,7 +1104,7 @@ class MS_Switches: area = "FE" level = 3 htoff = False - sample = ['IDQ.MS_SWITCHES'] + sample = ['FRONTEND_RETIRED.MS_FLOWS'] errcount = 0 sibling = None metricgroup = frozenset(['FetchLat', 'MicroSeq']) @@ -1313,11 +1335,11 @@ class Branch_Mispredicts: sample = ['TOPDOWN.BR_MISPREDICT_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.BR_MISPREDICT_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Branch_Mispredicts zero division") @@ -1344,7 +1366,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1368,7 +1390,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1398,7 +1420,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1422,11 +1444,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1458,7 +1480,7 @@ class Memory_Bound: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.MEMORY_BOUND_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Memory_Bound zero division") @@ -1514,8 +1536,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1546,7 +1568,7 @@ class Load_STLB_Hit: errcount = 
0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1570,7 +1592,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1615,13 +1637,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1674,7 +1721,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1703,7 +1750,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1753,7 +1800,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1781,7 +1828,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1808,7 +1855,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1836,7 +1883,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1864,9 +1911,7 @@ class HBM_Bound: maxval = None def compute(self, EV): try: - self.val = MEM_Bound_Ratio(self, EV, 3) if HBM_Only(self, EV, 3) else MEM_Bound_Ratio(self, EV, 3) * EV("OCR.DEMAND_DATA_RD.PMM", 3) / EV("OCR.READS_TO_CORE.L3_MISS", 3) - 
EV("OCR.READS_TO_CORE.L3_MISS", 3) - EV("OCR.DEMAND_DATA_RD.PMM", 3) + self.val = MEM_Bound_Ratio(self, EV, 3) * EV("OCR.DEMAND_DATA_RD.PMM", 3) / EV("OCR.READS_TO_CORE.L3_MISS", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "HBM_Bound zero division") @@ -1889,7 +1934,7 @@ class DRAM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = self.HBM_Bound.compute(EV) if HBM_Only(self, EV, 3) else MEM_Bound_Ratio(self, EV, 3) - self.HBM_Bound.compute(EV) + self.val = MEM_Bound_Ratio(self, EV, 3) - self.HBM_Bound.compute(EV) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "DRAM_Bound zero division") @@ -1909,7 +1954,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1971,7 +2016,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1997,7 +2042,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -2028,7 +2073,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_MEM zero division") @@ -2052,7 +2098,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -2077,10 +2125,10 @@ class PMM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0 ) - EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) 
else 0) if PMM_App_Direct else 0 EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3) EV("MEM_LOAD_RETIRED.L1_MISS", 3) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "PMM_Bound zero division") @@ -2129,7 +2177,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2157,7 +2205,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2212,7 +2260,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -2236,7 +2285,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2344,8 +2393,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIV_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2370,7 +2419,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2489,7 +2538,7 @@ class AMX_Busy: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Compute', 'HPC', 'Server']) + metricgroup = frozenset(['BvCB', 'Compute', 'HPC', 'Server']) maxval = None def compute(self, EV): try: @@ -2553,7 +2602,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + EV("RS.EMPTY:u1", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) + self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + max(EV("RS.EMPTY:u1", 4) - EV("RESOURCE_STALLS.SCOREBOARD", 4) , 0)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2663,7 +2712,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2826,11 +2875,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / 
EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2900,7 +2949,7 @@ class FP_Arith: maxval = None def compute(self, EV): try: - self.val = self.X87_Use.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV) + self.FP_AMX.compute(EV) + self.val = self.X87_Use.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Arith zero division") @@ -3003,7 +3052,10 @@ class FP_Vector_128b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_128b zero division") @@ -3028,7 +3080,10 @@ class FP_Vector_256b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_256b zero division") @@ -3040,32 +3095,6 @@ def compute(self, EV): length""" -class FP_AMX: - name = "FP_AMX" - domain = "Uops_Estimated" - area = "RET" - level = 4 - htoff = False - sample = [] - errcount = 0 - sibling = None - metricgroup = frozenset(['Compute', 'Flops', 'HPC', 'Pipeline', 'Server']) - maxval = None - def compute(self, EV): - try: - self.val = EV("AMX_OPS_RETIRED.BF16:c1", 4) / Retired_Slots(self, EV, 4) - self.thresh = (self.val > 0.1) and self.parent.thresh - except ZeroDivisionError: - handle_error(self, "FP_AMX zero division") - return self.val - desc = """ -This metric approximates arithmetic floating-point (FP) -matrix uops fraction the CPU has retired (aggregated across -all supported FP datatypes in AMX engine). 
Refer to AMX_Busy -and GFLOPs metrics for actual AMX utilization and FP -performance, resp.""" - - class Int_Operations: name = "Int_Operations" domain = "Uops" @@ -3135,35 +3164,9 @@ def compute(self, EV): handle_error(self, "Int_Vector_256b zero division") return self.val desc = """ -This metric represents 256-bit vector Integer ADD/SUB/SAD or -VNNI (Vector Neural Network Instructions) uops fraction the -CPU has retired.""" - - -class Int_AMX: - name = "Int_AMX" - domain = "Uops_Estimated" - area = "RET" - level = 4 - htoff = False - sample = [] - errcount = 0 - sibling = None - metricgroup = frozenset(['Compute', 'HPC', 'IntVector', 'Pipeline', 'Server']) - maxval = None - def compute(self, EV): - try: - self.val = EV("AMX_OPS_RETIRED.INT8:c1", 4) / Retired_Slots(self, EV, 4) - self.thresh = (self.val > 0.1) and self.parent.thresh - except ZeroDivisionError: - handle_error(self, "Int_AMX zero division") - return self.val - desc = """ -This metric approximates arithmetic Integer (Int) matrix -uops fraction the CPU has retired (aggregated across all -supported Int datatypes in AMX engine). Refer to AMX_Busy -and TIOPs metrics for actual AMX utilization and Int -performance, resp.""" +This metric represents 256-bit vector Integer +ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) +uops fraction the CPU has retired.""" class Memory_Operations: @@ -3199,7 +3202,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3211,8 +3214,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -3225,7 +3228,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3275,7 +3278,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3323,14 +3326,14 @@ class Heavy_Operations: area = "RET" level = 2 htoff = False - sample = [] + sample = ['UOPS_RETIRED.HEAVY'] errcount = 0 sibling = None metricgroup = frozenset(['Retire', 'TmaL2']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("UOPS_RETIRED.HEAVY", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) except ZeroDivisionError: handle_error(self, "Heavy_Operations zero division") @@ -3405,7 +3408,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3518,7 +3521,7 @@ class CISC: area = "RET" level = 4 htoff = False - sample = [] + sample = ['FRONTEND_RETIRED.MS_FLOWS'] errcount = 0 sibling = None metricgroup = frozenset([]) @@ -3546,7 +3549,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3566,7 +3569,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3587,7 +3590,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3598,7 +3601,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3607,7 +3611,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3627,7 +3631,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3647,7 +3651,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def 
compute(self, EV): @@ -3667,7 +3671,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3677,7 +3681,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3688,7 +3692,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3709,7 +3713,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3731,7 +3735,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3741,10 +3745,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3753,7 +3756,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3763,31 +3766,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). 
Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3842,7 +3868,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -4024,7 +4050,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -4046,26 +4073,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -4182,7 +4189,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -4244,8 +4251,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_HP: @@ -4265,8 +4273,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_HP zero division") desc = """ Instructions per FP Arithmetic Scalar Half-Precision -instruction (lower number means higher occurrence rate). 
May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_SP: @@ -4286,8 +4295,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -4307,8 +4317,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -4328,8 +4339,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -4349,50 +4360,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" - - -class Metric_IpArith_AMX_F16: - name = "IpArith_AMX_F16" - domain = "Inst_Metric" - maxval = 0 - errcount = 0 - area = "Info.Inst_Mix" - metricgroup = frozenset(['Flops', 'FpVector', 'InsType', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = IpArith_AMX_F16(self, EV, 0) - self.thresh = (self.val < 10) - except ZeroDivisionError: - handle_error_metric(self, "IpArith_AMX_F16 zero division") - desc = """ -Instructions per FP Arithmetic AMX operation (lower number -means higher occurrence rate). Operations factored per -matrices' sizes of the AMX instructions.""" - - -class Metric_IpArith_AMX_Int8: - name = "IpArith_AMX_Int8" - domain = "Inst_Metric" - maxval = 0 - errcount = 0 - area = "Info.Inst_Mix" - metricgroup = frozenset(['IntVector', 'InsType', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = IpArith_AMX_Int8(self, EV, 0) - self.thresh = (self.val < 10) - except ZeroDivisionError: - handle_error_metric(self, "IpArith_AMX_Int8 zero division") - desc = """ -Instructions per Integer Arithmetic AMX operation (lower -number means higher occurrence rate). Operations factored -per matrices' sizes of the AMX instructions.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4518,7 +4487,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4532,8 +4501,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4639,6 +4664,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4794,7 +4839,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4814,7 +4859,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4834,7 +4879,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4854,7 +4899,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, 
"IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4895,7 +4940,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -5136,6 +5181,26 @@ def compute(self, EV): loads (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -5233,8 +5298,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -5253,8 +5317,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -5273,8 +5336,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -5293,8 +5355,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5699,7 +5760,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5718,7 +5779,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5791,45 +5852,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" - - -class Metric_GIOPs: - name = "GIOPs" - domain = "Metric" - maxval = 0 - errcount = 0 - area = "Info.System" - metricgroup = frozenset(['Cor', 'HPC', 'IntVector', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = GIOPs(self, EV, 0) - self.thresh = True - except ZeroDivisionError: - handle_error_metric(self, "GIOPs zero division") - desc = """ -Giga Integer (matrix) Operations Per Second""" - - -class Metric_TIOPs: - name = "TIOPs" - domain = "Metric" - maxval = 0 - errcount = 0 - area = "Info.System" - metricgroup = frozenset(['HPC', 'IntVector', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = TIOPs(self, EV, 0) - self.thresh = True - except ZeroDivisionError: - handle_error_metric(self, "TIOPs zero division") - desc = """ -Tera Integer (matrix) Operations Per Second""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -6312,6 +6335,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = FB_Full() ; r.run(n) ; o["FB_Full"] = n @@ -6366,11 +6390,9 @@ def __init__(self, r): n = FP_Vector() ; r.run(n) ; o["FP_Vector"] = n n = FP_Vector_128b() ; r.run(n) ; o["FP_Vector_128b"] = n n = FP_Vector_256b() ; r.run(n) ; o["FP_Vector_256b"] = n - n = FP_AMX() ; r.run(n) ; o["FP_AMX"] = n n = Int_Operations() ; r.run(n) ; o["Int_Operations"] = n n = Int_Vector_128b() ; r.run(n) ; o["Int_Vector_128b"] = n n = Int_Vector_256b() ; r.run(n) ; o["Int_Vector_256b"] = n - n = Int_AMX() ; r.run(n) ; o["Int_AMX"] = n n = Memory_Operations() ; r.run(n) ; o["Memory_Operations"] = n n = Fused_Instructions() ; r.run(n) ; o["Fused_Instructions"] = n n = Non_Fused_Branches() ; r.run(n) ; o["Non_Fused_Branches"] = n @@ -6412,6 +6434,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["FB_Full"].parent = o["L1_Bound"] @@ -6465,11 +6488,9 @@ def __init__(self, r): o["FP_Vector"].parent = o["FP_Arith"] o["FP_Vector_128b"].parent = o["FP_Vector"] o["FP_Vector_256b"].parent = o["FP_Vector"] - o["FP_AMX"].parent = o["FP_Arith"] o["Int_Operations"].parent = o["Light_Operations"] o["Int_Vector_128b"].parent = o["Int_Operations"] o["Int_Vector_256b"].parent = o["Int_Operations"] - o["Int_AMX"].parent = o["Int_Operations"] o["Memory_Operations"].parent = o["Light_Operations"] o["Fused_Instructions"].parent = o["Light_Operations"] o["Non_Fused_Branches"].parent = o["Light_Operations"] @@ -6498,7 +6519,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; 
r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -6512,7 +6534,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -6527,8 +6548,6 @@ def __init__(self, r): n = Metric_IpArith_Scalar_DP() ; r.metric(n) ; o["IpArith_Scalar_DP"] = n n = Metric_IpArith_AVX128() ; r.metric(n) ; o["IpArith_AVX128"] = n n = Metric_IpArith_AVX256() ; r.metric(n) ; o["IpArith_AVX256"] = n - n = Metric_IpArith_AMX_F16() ; r.metric(n) ; o["IpArith_AMX_F16"] = n - n = Metric_IpArith_AMX_Int8() ; r.metric(n) ; o["IpArith_AMX_Int8"] = n n = Metric_IpPause() ; r.metric(n) ; o["IpPause"] = n n = Metric_IpSWPF() ; r.metric(n) ; o["IpSWPF"] = n n = Metric_Instructions() ; r.metric(n) ; o["Instructions"] = n @@ -6536,11 +6555,15 @@ def __init__(self, r): n = Metric_Strings_Cycles() ; r.metric(n) ; o["Strings_Cycles"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_Unknown_Branch_Cost() ; r.metric(n) ; o["Unknown_Branch_Cost"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -6566,6 +6589,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -6599,8 +6623,6 @@ def __init__(self, r): n = Metric_Core_Frequency() ; r.metric(n) ; o["Core_Frequency"] = n n = Metric_Uncore_Frequency() ; r.metric(n) ; o["Uncore_Frequency"] = n n = Metric_GFLOPs() ; r.metric(n) ; o["GFLOPs"] = n - n = Metric_GIOPs() ; r.metric(n) ; o["GIOPs"] = n - n = Metric_TIOPs() ; r.metric(n) ; o["TIOPs"] = n n = Metric_Turbo_Utilization() ; r.metric(n) ; o["Turbo_Utilization"] = n n = Metric_SMT_2T_Utilization() ; r.metric(n) ; o["SMT_2T_Utilization"] = n n = Metric_Kernel_Utilization() ; r.metric(n) ; o["Kernel_Utilization"] = n @@ -6668,23 +6690,20 @@ def __init__(self, r): o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] - o["FP_Arith"].FP_Vector = o["FP_Vector"] - o["FP_Arith"].FP_Scalar = o["FP_Scalar"] - o["FP_Arith"].FP_AMX = o["FP_AMX"] o["FP_Arith"].Retiring = o["Retiring"] + o["FP_Arith"].FP_Scalar = o["FP_Scalar"] 
o["FP_Arith"].X87_Use = o["X87_Use"] + o["FP_Arith"].FP_Vector = o["FP_Vector"] o["X87_Use"].Retiring = o["Retiring"] o["FP_Scalar"].Retiring = o["Retiring"] o["FP_Vector"].Retiring = o["Retiring"] o["FP_Vector_128b"].Retiring = o["Retiring"] o["FP_Vector_256b"].Retiring = o["Retiring"] - o["FP_AMX"].Retiring = o["Retiring"] o["Int_Operations"].Retiring = o["Retiring"] o["Int_Operations"].Int_Vector_256b = o["Int_Vector_256b"] o["Int_Operations"].Int_Vector_128b = o["Int_Vector_128b"] o["Int_Vector_128b"].Retiring = o["Retiring"] o["Int_Vector_256b"].Retiring = o["Retiring"] - o["Int_AMX"].Retiring = o["Retiring"] o["Memory_Operations"].Retiring = o["Retiring"] o["Memory_Operations"].Light_Operations = o["Light_Operations"] o["Memory_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -6695,17 +6714,16 @@ def __init__(self, r): o["Non_Fused_Branches"].Light_Operations = o["Light_Operations"] o["Non_Fused_Branches"].Heavy_Operations = o["Heavy_Operations"] o["Other_Light_Ops"].Light_Operations = o["Light_Operations"] - o["Other_Light_Ops"].FP_Scalar = o["FP_Scalar"] o["Other_Light_Ops"].Retiring = o["Retiring"] o["Other_Light_Ops"].Heavy_Operations = o["Heavy_Operations"] o["Other_Light_Ops"].Int_Operations = o["Int_Operations"] o["Other_Light_Ops"].Non_Fused_Branches = o["Non_Fused_Branches"] - o["Other_Light_Ops"].FP_AMX = o["FP_AMX"] + o["Other_Light_Ops"].FP_Arith = o["FP_Arith"] o["Other_Light_Ops"].Fused_Instructions = o["Fused_Instructions"] o["Other_Light_Ops"].Int_Vector_128b = o["Int_Vector_128b"] o["Other_Light_Ops"].FP_Vector = o["FP_Vector"] + o["Other_Light_Ops"].FP_Scalar = o["FP_Scalar"] o["Other_Light_Ops"].X87_Use = o["X87_Use"] - o["Other_Light_Ops"].FP_Arith = o["FP_Arith"] o["Other_Light_Ops"].Int_Vector_256b = o["Int_Vector_256b"] o["Other_Light_Ops"].Memory_Operations = o["Memory_Operations"] o["Nop_Instructions"].Retiring = o["Retiring"] @@ -6764,16 +6782,17 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].PMM_Bound = o["PMM_Bound"] - o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] @@ -6799,13 +6818,13 @@ def __init__(self, r): o["Cache_Memory_Latency"].HBM_Bound = o["HBM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] 
o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].PMM_Bound = o["PMM_Bound"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -6814,6 +6833,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].HBM_Bound = o["HBM_Bound"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Synchronization"].L1_Bound = o["L1_Bound"] o["Memory_Synchronization"].Frontend_Bound = o["Frontend_Bound"] @@ -6894,7 +6914,7 @@ def __init__(self, r): o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] - o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] @@ -6914,6 +6934,7 @@ def __init__(self, r): o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] + o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] @@ -6938,19 +6959,19 @@ def __init__(self, r): o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] o["Other_Bottlenecks"].HBM_Bound = o["HBM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].MITE = o["MITE"] o["DSB_Misses"].LCP = o["LCP"] @@ -6964,6 +6985,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = 
o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -7041,5 +7067,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/spr_server_ratios.py b/spr_server_ratios.py index af9842ce..08631bba 100644 --- a/spr_server_ratios.py +++ b/spr_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors 4th gen (code name Sapphire Rapids) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors 4th gen (code name Sapphire Rapids) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 1 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -46,7 +49,13 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 -Errata_Whitelist = "SPR103" +Errata_Whitelist = "SPR121;SPR103" +Memory = 0 +PMM_App_Direct = 1 if Memory == 1 else 0 +HBM = 1 if Memory > 1 else 0 +PERF_METRICS_MSR = 1 +FP16 = 1 +DS = 1 # Aux. 
formulas @@ -55,7 +64,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + EV("RS.EMPTY:u1", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.AMX_Busy.compute(EV) + self.Divider.compute(EV)) @@ -96,15 +105,29 @@ def Few_Uops_Executed_Threshold(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FLOP_Count(self, EV, level): - return (1 *(EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR_HALF", level)) + 2 *(EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF", level)) + 4 * EV("FP_ARITH_INST_RETIRED.4_FLOPS", level) + 8 *(EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.8_FLOPS", level)) + 16 *(EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) + 32 * EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", level) + 4 * EV("AMX_OPS_RETIRED.BF16", level)) + return (1 *(EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR_HALF", level)) + 2 *(EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF", level)) + 4 * EV("FP_ARITH_INST_RETIRED.4_FLOPS", level) + 8 *(EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.8_FLOPS", level)) + 16 *(EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) + 32 * EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", level)) if FP16 else(1 * EV("FP_ARITH_INST_RETIRED.SCALAR", level) + 2 * EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + 4 * EV("FP_ARITH_INST_RETIRED.4_FLOPS", level) + 8 * EV("FP_ARITH_INST_RETIRED.8_FLOPS", level) + 16 * EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) if FP16 else EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return
EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0xfc", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) + EV("FP_ARITH_INST_RETIRED.VECTOR", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) if FP16 else EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -120,19 +143,24 @@ def LOAD_LCL_MEM(self, EV, level): return EV("MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) def LOAD_LCL_PMM(self, EV, level): - return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) + return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_FWD(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_HITM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_MEM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_PMM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_XSNP_HIT(self, EV, level): return EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", level) + EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", level) * (1 - True_XSNP_HitM_Fraction(self, EV, level)) @@ -147,7 +175,7 @@ def MEM_Bound_Ratio(self, EV, level): return EV("MEMORY_ACTIVITY.STALLS_L3_MISS", level) / CLKS(self, EV, level) def Mem_DDR_Hit_Fraction(self, EV, level): - return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) + return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) if DS else 1 def Mem_Lock_St_Fraction(self, EV, level): return EV("MEM_INST_RETIRED.LOCK_LOADS", level) / EV("MEM_INST_RETIRED.ALL_STORES", level) @@ -171,28 +199,28 @@ def True_XSNP_HitM_Fraction(self, EV, level): return EV("OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", level) / (EV("OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", level) + EV("OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD", level)) def 
Mem_XSNP_HitM_Cost(self, EV, level): - return 80 * Core_Frequency(self, EV, level) + return 81 * Core_Frequency(self, EV, level) def Mem_XSNP_Hit_Cost(self, EV, level): - return 79.5 * Core_Frequency(self, EV, level) + return 79 * Core_Frequency(self, EV, level) def Mem_XSNP_None_Cost(self, EV, level): return 37 * Core_Frequency(self, EV, level) def Mem_Local_DRAM_Cost(self, EV, level): - return 108 * Core_Frequency(self, EV, level) + return 109 * Core_Frequency(self, EV, level) def Mem_Remote_DRAM_Cost(self, EV, level): - return 186 * Core_Frequency(self, EV, level) + return 190 * Core_Frequency(self, EV, level) def Mem_Remote_HitM_Cost(self, EV, level): - return 172.5 * Core_Frequency(self, EV, level) + return 170 * Core_Frequency(self, EV, level) def Mem_Remote_Fwd_Cost(self, EV, level): - return 172.5 * Core_Frequency(self, EV, level) + return 170 * Core_Frequency(self, EV, level) def Mem_L2_Hit_Cost(self, EV, level): - return 4 * Core_Frequency(self, EV, level) + return 4.4 * Core_Frequency(self, EV, level) def PERF_METRICS_SUM(self, EV, level): return (EV("PERF_METRICS.FRONTEND_BOUND", level) / EV("TOPDOWN.SLOTS", level)) + (EV("PERF_METRICS.BAD_SPECULATION", level) / EV("TOPDOWN.SLOTS", level)) + (EV("PERF_METRICS.RETIRING", level) / EV("TOPDOWN.SLOTS", level)) + (EV("PERF_METRICS.BACKEND_BOUND", level) / EV("TOPDOWN.SLOTS", level)) @@ -206,13 +234,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def Num_CPUs(self, EV, level): - return 224 if smt_enabled else 112 - -def Memory(self, EV, level): - return 1 + return num_cores * num_sockets * num_threads if num_cores else 224 /(2 - smt_enabled ) -def PMM_App_Direct(self, EV, level): - return 1 if Memory(self, EV, level)== 1 else 0 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -226,7 +252,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -234,23 +260,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + 
self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + 
self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -268,24 +294,30 @@ def 
Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -296,7 +328,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -312,7 +344,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -334,7 +366,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (EV("FP_ARITH_DISPATCHED.PORT_0", level) + EV("FP_ARITH_DISPATCHED.PORT_1", level) + EV("FP_ARITH_DISPATCHED.PORT_5", level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -342,12 +374,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -372,11 +398,11 @@ def IpBranch(self, EV, level): # Instructions per (near) call (lower number means higher occurrence rate) def IpCall(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("CPU_CLK_UNHALTED.NEAR_CALL", level) + val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_CALL", level) self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -392,57 +418,59 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. 
def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_HP(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED2.SCALAR", level) if FP16 else 0 self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level)) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.
def IpArith_AVX256(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level)) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX512(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", level)) - self.thresh = (val < 10) - return val - -# Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions. -def IpArith_AMX_F16(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("AMX_OPS_RETIRED.BF16", level) - self.thresh = (val < 10) - return val - -# Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions.
-def IpArith_AMX_Int8(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("AMX_OPS_RETIRED.INT8", level) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val @@ -476,10 +504,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -504,6 +543,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_DATA.STALLS", level) / EV("ICACHE_DATA.STALLS:c1:e1", level) @@ -538,25 +583,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).
def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -566,7 +611,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -618,6 +663,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -634,19 +683,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -730,11 +775,11 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -744,18 +789,10 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine. +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) -# Giga Integer (matrix) Operations Per Second -def GIOPs(self, EV, level): - return 8 * EV("AMX_OPS_RETIRED.INT8", level) / 1e9 / Time(self, EV, level) - -# Tera Integer (matrix) Operations Per Second -def TIOPs(self, EV, level): - return GIOPs(self, EV, level) / 1000 - # Average Frequency Utilization relative nominal frequency def Turbo_Utilization(self, EV, level): return CLKS(self, EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) @@ -806,7 +843,9 @@ def MEM_Parallel_Reads(self, EV, level): # Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_PMM_Read_Latency(self, EV, level): - return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) + return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) if PMM_App_Direct else 0 # Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_DRAM_Read_Latency(self, EV, level): @@ -814,11 +853,11 @@ def MEM_DRAM_Read_Latency(self, EV, level): # Average 3DXP Memory Bandwidth Use for reads [GB / sec] def PMM_Read_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average 3DXP Memory Bandwidth Use for Writes [GB / sec] def PMM_Write_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average IO (network or disk) Bandwidth Use for Reads [GB / sec].
Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU def IO_Read_BW(self, EV, level): @@ -860,11 +899,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_BUBBLES.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -899,7 +938,7 @@ class Fetch_Latency: maxval = None def compute(self, EV): try: - self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) + self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) if topdown_use_fixed else(EV("IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", 2) * Pipeline_Width - EV("INT_MISC.UOP_DROPPING", 2)) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Fetch_Latency zero division") @@ -922,7 +961,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -947,7 +986,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -1004,7 +1043,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -1052,7 +1091,7 @@ class Unknown_Branches: sample = ['FRONTEND_RETIRED.UNKNOWN_BRANCH'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1306,11 +1345,11 @@ class Branch_Mispredicts: sample = ['TOPDOWN.BR_MISPREDICT_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.BR_MISPREDICT_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: 
handle_error(self, "Branch_Mispredicts zero division") @@ -1337,7 +1376,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1361,7 +1400,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1391,7 +1430,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1415,11 +1454,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1451,7 +1490,7 @@ class Memory_Bound: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.MEMORY_BOUND_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Memory_Bound zero division") @@ -1507,8 +1546,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1539,7 +1578,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1563,7 +1602,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1608,13 +1647,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) 
, 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1667,7 +1731,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1696,7 +1760,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1746,7 +1810,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1774,7 +1838,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1801,7 +1865,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1829,7 +1893,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1877,7 +1941,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1939,7 +2003,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1965,7 +2029,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -1996,7 +2060,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, 
"Remote_MEM zero division") @@ -2020,7 +2085,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -2045,10 +2112,10 @@ class PMM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0 ) - EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0) if PMM_App_Direct else 0 EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3) EV("MEM_LOAD_RETIRED.L1_MISS", 3) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "PMM_Bound zero division") @@ -2097,7 +2164,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2125,7 +2192,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2180,7 +2247,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -2204,7 +2272,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2312,8 +2380,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIV_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2338,7 +2406,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2457,7 
+2525,7 @@ class AMX_Busy: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Compute', 'HPC', 'Server']) + metricgroup = frozenset(['BvCB', 'Compute', 'HPC', 'Server']) maxval = None def compute(self, EV): try: @@ -2521,7 +2589,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + EV("RS.EMPTY:u1", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) + self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + max(EV("RS.EMPTY:u1", 4) - EV("RESOURCE_STALLS.SCOREBOARD", 4) , 0)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2631,7 +2699,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2794,11 +2862,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2868,7 +2936,7 @@ class FP_Arith: maxval = None def compute(self, EV): try: - self.val = self.X87_Use.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV) + self.FP_AMX.compute(EV) + self.val = self.X87_Use.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Arith zero division") @@ -2971,7 +3039,10 @@ class FP_Vector_128b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_128b zero division") @@ -2996,7 +3067,10 @@ class FP_Vector_256b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + 
EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_256b zero division") @@ -3021,7 +3095,10 @@ class FP_Vector_512b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_512b zero division") @@ -3032,32 +3109,6 @@ def compute(self, EV): due to FMA double counting.""" -class FP_AMX: - name = "FP_AMX" - domain = "Uops_Estimated" - area = "RET" - level = 4 - htoff = False - sample = [] - errcount = 0 - sibling = None - metricgroup = frozenset(['Compute', 'Flops', 'HPC', 'Pipeline', 'Server']) - maxval = None - def compute(self, EV): - try: - self.val = EV("AMX_OPS_RETIRED.BF16:c1", 4) / Retired_Slots(self, EV, 4) - self.thresh = (self.val > 0.1) and self.parent.thresh - except ZeroDivisionError: - handle_error(self, "FP_AMX zero division") - return self.val - desc = """ -This metric approximates arithmetic floating-point (FP) -matrix uops fraction the CPU has retired (aggregated across -all supported FP datatypes in AMX engine). 
Refer to AMX_Busy -and GFLOPs metrics for actual AMX utilization and FP -performance, resp.""" - - class Int_Operations: name = "Int_Operations" domain = "Uops" @@ -3071,7 +3122,7 @@ class Int_Operations: maxval = None def compute(self, EV): try: - self.val = self.Int_Vector_128b.compute(EV) + self.Int_Vector_256b.compute(EV) + self.Int_AMX.compute(EV) + self.val = self.Int_Vector_128b.compute(EV) + self.Int_Vector_256b.compute(EV) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Int_Operations zero division") @@ -3127,35 +3178,9 @@ def compute(self, EV): handle_error(self, "Int_Vector_256b zero division") return self.val desc = """ -This metric represents 256-bit vector Integer ADD/SUB/SAD or -VNNI (Vector Neural Network Instructions) uops fraction the -CPU has retired.""" - - -class Int_AMX: - name = "Int_AMX" - domain = "Uops_Estimated" - area = "RET" - level = 4 - htoff = False - sample = [] - errcount = 0 - sibling = None - metricgroup = frozenset(['Compute', 'HPC', 'IntVector', 'Pipeline', 'Server']) - maxval = None - def compute(self, EV): - try: - self.val = EV("AMX_OPS_RETIRED.INT8:c1", 4) / Retired_Slots(self, EV, 4) - self.thresh = (self.val > 0.1) and self.parent.thresh - except ZeroDivisionError: - handle_error(self, "Int_AMX zero division") - return self.val - desc = """ -This metric approximates arithmetic Integer (Int) matrix -uops fraction the CPU has retired (aggregated across all -supported Int datatypes in AMX engine). Refer to AMX_Busy -and TIOPs metrics for actual AMX utilization and Int -performance, resp.""" +This metric represents 256-bit vector Integer +ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) +uops fraction the CPU has retired.""" class Memory_Operations: @@ -3191,7 +3216,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3203,8 +3228,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -3217,7 +3242,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3267,7 +3292,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3322,7 +3347,7 @@ class Heavy_Operations: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("UOPS_RETIRED.HEAVY", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) except ZeroDivisionError: handle_error(self, "Heavy_Operations zero division") @@ -3397,7 +3422,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3538,7 +3563,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3558,7 +3583,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3579,7 +3604,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3590,7 +3615,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3599,7 +3625,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3619,7 +3645,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3639,7 +3665,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3659,7 +3685,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3669,7 +3695,7 @@ def compute(self, EV): except ZeroDivisionError: 
handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3680,7 +3706,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3701,7 +3727,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3723,7 +3749,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3733,10 +3759,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3745,7 +3770,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3755,31 +3780,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. 
Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3834,7 +3882,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -4016,7 +4064,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -4038,26 +4087,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -4174,7 +4203,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -4236,8 +4265,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_HP: @@ -4257,8 +4287,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_HP zero division") desc = """ Instructions per FP Arithmetic Scalar Half-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_SP: @@ -4278,8 +4309,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -4299,8 +4331,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -4320,8 +4353,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -4341,8 +4374,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -4362,50 +4395,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" - - -class Metric_IpArith_AMX_F16: - name = "IpArith_AMX_F16" - domain = "Inst_Metric" - maxval = 0 - errcount = 0 - area = "Info.Inst_Mix" - metricgroup = frozenset(['Flops', 'FpVector', 'InsType', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = IpArith_AMX_F16(self, EV, 0) - self.thresh = (self.val < 10) - except ZeroDivisionError: - handle_error_metric(self, "IpArith_AMX_F16 zero division") - desc = """ -Instructions per FP Arithmetic AMX operation (lower number -means higher occurrence rate). Operations factored per -matrices' sizes of the AMX instructions.""" - - -class Metric_IpArith_AMX_Int8: - name = "IpArith_AMX_Int8" - domain = "Inst_Metric" - maxval = 0 - errcount = 0 - area = "Info.Inst_Mix" - metricgroup = frozenset(['IntVector', 'InsType', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = IpArith_AMX_Int8(self, EV, 0) - self.thresh = (self.val < 10) - except ZeroDivisionError: - handle_error_metric(self, "IpArith_AMX_Int8 zero division") - desc = """ -Instructions per Integer Arithmetic AMX operation (lower -number means higher occurrence rate). Operations factored -per matrices' sizes of the AMX instructions.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4531,7 +4522,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4545,8 +4536,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4652,6 +4699,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4807,7 +4874,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4827,7 +4894,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4847,7 +4914,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4867,7 +4934,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, 
"IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4908,7 +4975,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -5149,6 +5216,26 @@ def compute(self, EV): loads (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -5246,8 +5333,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -5266,8 +5352,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -5286,8 +5371,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -5306,8 +5390,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5691,7 +5774,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5710,7 +5793,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5783,45 +5866,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width and AMX engine.""" - - -class Metric_GIOPs: - name = "GIOPs" - domain = "Metric" - maxval = 0 - errcount = 0 - area = "Info.System" - metricgroup = frozenset(['Cor', 'HPC', 'IntVector', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = GIOPs(self, EV, 0) - self.thresh = True - except ZeroDivisionError: - handle_error_metric(self, "GIOPs zero division") - desc = """ -Giga Integer (matrix) Operations Per Second""" - - -class Metric_TIOPs: - name = "TIOPs" - domain = "Metric" - maxval = 0 - errcount = 0 - area = "Info.System" - metricgroup = frozenset(['HPC', 'IntVector', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = TIOPs(self, EV, 0) - self.thresh = True - except ZeroDivisionError: - handle_error_metric(self, "TIOPs zero division") - desc = """ -Tera Integer (matrix) Operations Per Second""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -6285,6 +6330,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = FB_Full() ; r.run(n) ; o["FB_Full"] = n @@ -6339,11 +6385,9 @@ def __init__(self, r): n = FP_Vector_128b() ; r.run(n) ; o["FP_Vector_128b"] = n n = FP_Vector_256b() ; r.run(n) ; o["FP_Vector_256b"] = n n = FP_Vector_512b() ; r.run(n) ; o["FP_Vector_512b"] = n - n = FP_AMX() ; r.run(n) ; o["FP_AMX"] = n n = Int_Operations() ; r.run(n) ; o["Int_Operations"] = n n = Int_Vector_128b() ; r.run(n) ; o["Int_Vector_128b"] = n n = Int_Vector_256b() ; r.run(n) ; o["Int_Vector_256b"] = n - n = Int_AMX() ; r.run(n) ; o["Int_AMX"] = n n = Memory_Operations() ; r.run(n) ; o["Memory_Operations"] = n n = Fused_Instructions() ; r.run(n) ; o["Fused_Instructions"] = n n = Non_Fused_Branches() ; r.run(n) ; o["Non_Fused_Branches"] = n @@ -6385,6 +6429,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["FB_Full"].parent = o["L1_Bound"] @@ -6438,11 +6483,9 @@ def __init__(self, r): o["FP_Vector_128b"].parent = o["FP_Vector"] o["FP_Vector_256b"].parent = o["FP_Vector"] o["FP_Vector_512b"].parent = o["FP_Vector"] - o["FP_AMX"].parent = o["FP_Arith"] o["Int_Operations"].parent = o["Light_Operations"] o["Int_Vector_128b"].parent = o["Int_Operations"] o["Int_Vector_256b"].parent = o["Int_Operations"] - o["Int_AMX"].parent = o["Int_Operations"] o["Memory_Operations"].parent = o["Light_Operations"] o["Fused_Instructions"].parent = o["Light_Operations"] o["Non_Fused_Branches"].parent = o["Light_Operations"] @@ -6471,7 +6514,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; 
o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -6485,7 +6529,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -6501,8 +6544,6 @@ def __init__(self, r): n = Metric_IpArith_AVX128() ; r.metric(n) ; o["IpArith_AVX128"] = n n = Metric_IpArith_AVX256() ; r.metric(n) ; o["IpArith_AVX256"] = n n = Metric_IpArith_AVX512() ; r.metric(n) ; o["IpArith_AVX512"] = n - n = Metric_IpArith_AMX_F16() ; r.metric(n) ; o["IpArith_AMX_F16"] = n - n = Metric_IpArith_AMX_Int8() ; r.metric(n) ; o["IpArith_AMX_Int8"] = n n = Metric_IpPause() ; r.metric(n) ; o["IpPause"] = n n = Metric_IpSWPF() ; r.metric(n) ; o["IpSWPF"] = n n = Metric_Instructions() ; r.metric(n) ; o["Instructions"] = n @@ -6510,11 +6551,15 @@ def __init__(self, r): n = Metric_Strings_Cycles() ; r.metric(n) ; o["Strings_Cycles"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_Unknown_Branch_Cost() ; r.metric(n) ; o["Unknown_Branch_Cost"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -6540,6 +6585,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -6572,8 +6618,6 @@ def __init__(self, r): n = Metric_Core_Frequency() ; r.metric(n) ; o["Core_Frequency"] = n n = Metric_Uncore_Frequency() ; r.metric(n) ; o["Uncore_Frequency"] = n n = Metric_GFLOPs() ; r.metric(n) ; o["GFLOPs"] = n - n = Metric_GIOPs() ; r.metric(n) ; o["GIOPs"] = n - n = Metric_TIOPs() ; r.metric(n) ; o["TIOPs"] = n n = Metric_Turbo_Utilization() ; r.metric(n) ; o["Turbo_Utilization"] = n n = Metric_SMT_2T_Utilization() ; r.metric(n) ; o["SMT_2T_Utilization"] = n n = Metric_Kernel_Utilization() ; r.metric(n) ; o["Kernel_Utilization"] = n @@ -6639,25 +6683,21 @@ def __init__(self, r): o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] - o["FP_Arith"].FP_Vector = o["FP_Vector"] - o["FP_Arith"].FP_Scalar = o["FP_Scalar"] - o["FP_Arith"].FP_AMX = o["FP_AMX"] o["FP_Arith"].Retiring = o["Retiring"] + 
o["FP_Arith"].FP_Scalar = o["FP_Scalar"] o["FP_Arith"].X87_Use = o["X87_Use"] + o["FP_Arith"].FP_Vector = o["FP_Vector"] o["X87_Use"].Retiring = o["Retiring"] o["FP_Scalar"].Retiring = o["Retiring"] o["FP_Vector"].Retiring = o["Retiring"] o["FP_Vector_128b"].Retiring = o["Retiring"] o["FP_Vector_256b"].Retiring = o["Retiring"] o["FP_Vector_512b"].Retiring = o["Retiring"] - o["FP_AMX"].Retiring = o["Retiring"] o["Int_Operations"].Retiring = o["Retiring"] - o["Int_Operations"].Int_AMX = o["Int_AMX"] o["Int_Operations"].Int_Vector_256b = o["Int_Vector_256b"] o["Int_Operations"].Int_Vector_128b = o["Int_Vector_128b"] o["Int_Vector_128b"].Retiring = o["Retiring"] o["Int_Vector_256b"].Retiring = o["Retiring"] - o["Int_AMX"].Retiring = o["Retiring"] o["Memory_Operations"].Retiring = o["Retiring"] o["Memory_Operations"].Light_Operations = o["Light_Operations"] o["Memory_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -6668,18 +6708,16 @@ def __init__(self, r): o["Non_Fused_Branches"].Light_Operations = o["Light_Operations"] o["Non_Fused_Branches"].Heavy_Operations = o["Heavy_Operations"] o["Other_Light_Ops"].Light_Operations = o["Light_Operations"] - o["Other_Light_Ops"].FP_Scalar = o["FP_Scalar"] o["Other_Light_Ops"].Retiring = o["Retiring"] o["Other_Light_Ops"].Heavy_Operations = o["Heavy_Operations"] - o["Other_Light_Ops"].Int_AMX = o["Int_AMX"] o["Other_Light_Ops"].Int_Operations = o["Int_Operations"] o["Other_Light_Ops"].Non_Fused_Branches = o["Non_Fused_Branches"] - o["Other_Light_Ops"].FP_AMX = o["FP_AMX"] + o["Other_Light_Ops"].FP_Arith = o["FP_Arith"] o["Other_Light_Ops"].Fused_Instructions = o["Fused_Instructions"] o["Other_Light_Ops"].Int_Vector_128b = o["Int_Vector_128b"] o["Other_Light_Ops"].FP_Vector = o["FP_Vector"] + o["Other_Light_Ops"].FP_Scalar = o["FP_Scalar"] o["Other_Light_Ops"].X87_Use = o["X87_Use"] - o["Other_Light_Ops"].FP_Arith = o["FP_Arith"] o["Other_Light_Ops"].Int_Vector_256b = o["Int_Vector_256b"] o["Other_Light_Ops"].Memory_Operations = o["Memory_Operations"] o["Nop_Instructions"].Retiring = o["Retiring"] @@ -6737,46 +6775,53 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].PMM_Bound = o["PMM_Bound"] - o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - 
o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] o["Cache_Memory_Latency"].PMM_Bound = o["PMM_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] - o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].PMM_Bound = o["PMM_Bound"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -6785,6 +6830,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Synchronization"].L1_Bound = o["L1_Bound"] o["Memory_Synchronization"].Frontend_Bound = o["Frontend_Bound"] @@ -6864,7 +6910,7 @@ def __init__(self, r): o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] - o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] @@ -6884,6 +6930,7 @@ def __init__(self, r): o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Microcode_Sequencer = 
o["Microcode_Sequencer"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] + o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] @@ -6907,19 +6954,19 @@ def __init__(self, r): o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].MITE = o["MITE"] o["DSB_Misses"].LCP = o["LCP"] @@ -6933,6 +6980,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -7011,5 +7063,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/tl-tester b/tl-tester index ae774d16..64d44f15 100755 --- a/tl-tester +++ b/tl-tester @@ -15,7 +15,7 @@ # NOCPUS=1 skip individual CPU tests # MATCH=cpu in cpu loops only run for CPU cpu # ONLYCPU=cpu Only test cpu type. 
cur for current CPU (makes test suite run much faster) -# NOMULTIPLEX=" " Disable no multiplex +# NOMULTIPLEX=" " Disable no multiplex # NATIVE_ARGS="" Pass argument to toplevs without --force-cpu (to override on unsupported hosts) set -e @@ -121,6 +121,7 @@ SMTCPUS="${SMTCPUS:-snb jkt ivb ivt hsw hsx bdw skl bdx knl skx clx icl tgl icx NOSMTCPUS="${NOSMTCPUS:-slm simple ehl adl-grt}" ALLCPUS="${ALLCPUS:-$SMTCPUS $NOSMTCPUS}" fi +METRICCPUS='icl|tgl|mtl-rwc|spr|sprmax|adl-glc|icx' ALL=--all @@ -334,7 +335,8 @@ notfound $j fi fi -if [ $j == "icl" -o $j == "tgl" ] ; then +case "$j" in +$METRICCPUS) FORCEMETRICS=1 $WRAP ./toplev.py --force-cpu $j --filterquals --force-topology topology --force-events --print $ALL $OPT $LOAD | tee log$$ grep topdown- log$$ @@ -345,7 +347,9 @@ grep topdown- log$$ grep :D log$$ rm log$$ -fi # icl +;; +esac + done diff --git a/toplev.py b/toplev.py index d6145650..3a6e4d3a 100755 --- a/toplev.py +++ b/toplev.py @@ -3925,6 +3925,9 @@ def init_model(model, runner, pe): model.print_error = pe model.check_event = lambda ev: ectx.emap.getevent(ev) is not None model.Setup(runner) + model.num_cores = len(cpu.coreids) # includes sockets + if cpu.ht: + model.num_threads = 2 if "Errata_Whitelist" in model.__dict__: ectx.errata_whitelist += model.Errata_Whitelist.split(";")