From 9f7b475efb3b6c050fa1483c9d46195708976ff1 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 2 Apr 2024 16:30:03 -0700
Subject: [PATCH] update toplev to TMA 4.8

* toplev updated to TMA 4.8:
  * Bottlenecks View:
    * Renamed Base_Non_Br to Useful_Work and simplified descriptions for all BV metrics.
    * Cache_Memory_Latency now accounts for L1 cache latency as well.
    * Improved Branching_Overhead accuracy for function calling and alignments
    * Cross-reference Bottlenecks w/ TMA tree for tool visualization (VTune request)
  * New Tree Nodes
    * L1_Hit_Latency: estimates fraction of cycles with demand load accesses that hit the L1 cache (relies on Dependent_Loads_Weight SystemParameter today)
  * New Informative Metrics
    * Fetch_LSD (client), Fetch_DSB, Fetch_MITE under Info.Pipeline group [SKL onwards]
    * DSB_Bandwidth under Info.Botlnk.L2
    * L2MPKI_RFO under Info.Memory
  * Key Enhancements & fixes
    * Fixed Ports_Utilization/Ports_Utilized_0
    * Slightly tuned memory (fixed cost) latencies [SPR, EMR]
    * Corrected CPU_Utilization, CPUs_Utilized for Linux perf based tools
---
 README.md                |  17 +
 adl_glc_ratios.py        | 568 ++++++++++++++++---------
 bdw_client_ratios.py     | 172 ++++----
 bdx_server_ratios.py     | 172 ++++----
 clx_server_ratios.py     | 591 ++++++++++++++++----------
 hsw_client_ratios.py     | 121 +++---
 hsx_server_ratios.py     | 121 +++---
 icl_client_ratios.py     | 608 +++++++++++++++++----------
 icx_server_ratios.py     | 669 +++++++++++++++++++-----------
 ivb_client_ratios.py     | 138 +++---
 ivb_server_ratios.py     | 138 +++---
 jkt_server_ratios.py     |  52 ++-
 skl_client_ratios.py     | 563 ++++++++++++++++---------
 skx_server_ratios.py     | 555 ++++++++++++++++---------
 snb_client_ratios.py     |  52 ++-
 spr_max_server_ratios.py | 829 +++++++++++++++++++------------------
 spr_server_ratios.py     | 875 +++++++++++++++++++++------------------
 tl-tester                |  10 +-
 toplev.py                |   3 +
 19 files changed, 3746 insertions(+), 2508 deletions(-)

diff --git a/README.md b/README.md
index 343cd24b..ba17a669 100644
--- a/README.md
+++ b/README.md
@@ -120,6 +120,23 @@ on newer Linux kernels.
 # Recent new features:
+## TMA 4.8 release
+* toplev updated to TMA 4.8:
+  * Bottlenecks View:
+    * Renamed Base_Non_Br to Useful_Work and simplified descriptions for all BV metrics.
+    * Cache_Memory_Latency now accounts for L1 cache latency as well.
+    * Improved Branching_Overhead accuracy for function calling and alignments
+    * Cross-reference Bottlenecks w/ TMA tree for tool visualization (VTune request)
+  * New Tree Nodes
+    * L1_Hit_Latency: estimates fraction of cycles with demand load accesses that hit the L1 cache (relies on Dependent_Loads_Weight SystemParameter today)
+  * New Informative Metrics
+    * Fetch_LSD (client), Fetch_DSB, Fetch_MITE under Info.Pipeline group [SKL onwards]
+    * DSB_Bandwidth under Info.Botlnk.L2
+    * L2MPKI_RFO under Info.Memory
+  * Key Enhancements & fixes
+    * Fixed Ports_Utilization/Ports_Utilized_0
+    * Slightly tuned memory (fixed cost) latencies [SPR, EMR]
+    * Corrected CPU_Utilization, CPUs_Utilized for Linux perf based tools
 * toplev now supports Meteor Lake systems.
 * Add a new genretlat.py tool to tune the toplev model for a workload. The
   basic tuning needs to be generated before first toplev use using genretlat
   -o mtl-retlat.json ./workloads/BC1s (or suitable workload).
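The new L1_Hit_Latency tree node scales L1-hit load counts by the Dependent_Loads_Weight SystemParameter and caps the result by cycles that had memory activity but no L1D miss. Below is a minimal standalone sketch of that estimate, mirroring the generated adl_glc_ratios.py formula in the diff that follows; the helper name and the example event counts are made up for illustration.

```python
# Sketch of the new L1_Hit_Latency estimate (TMA 4.8), mirroring the generated
# adl_glc_ratios.py formula. The event counts used below are hypothetical.

def l1_hit_latency(ev, dependent_loads_weight=20):
    """Estimated fraction of cycles exposed to L1d hit latency.

    dependent_loads_weight: SystemParameter from 0 (no dependent loads)
    to 100 (all loads are dependent); 20 is the model's default.
    """
    # Demand loads that hit L1 (all loads minus fill-buffer hits and L1 misses).
    l1_hit_loads = (ev["MEM_INST_RETIRED.ALL_LOADS"]
                    - ev["MEM_LOAD_RETIRED.FB_HIT"]
                    - ev["MEM_LOAD_RETIRED.L1_MISS"])
    # Cap: cycles with outstanding memory activity but no L1D miss pending.
    cap = max(ev["CYCLE_ACTIVITY.CYCLES_MEM_ANY"]
              - ev["MEMORY_ACTIVITY.CYCLES_L1D_MISS"], 0)
    # ~2 cycles per dependent L1 hit, weighted, capped, normalized to clocks.
    est = min(2 * l1_hit_loads * dependent_loads_weight / 100, cap)
    return est / ev["CPU_CLK_UNHALTED.THREAD"]

if __name__ == "__main__":
    # Hypothetical event counts, for illustration only.
    counts = {
        "MEM_INST_RETIRED.ALL_LOADS": 3_000_000,
        "MEM_LOAD_RETIRED.FB_HIT": 100_000,
        "MEM_LOAD_RETIRED.L1_MISS": 200_000,
        "CYCLE_ACTIVITY.CYCLES_MEM_ANY": 6_000_000,
        "MEMORY_ACTIVITY.CYCLES_L1D_MISS": 2_000_000,
        "CPU_CLK_UNHALTED.THREAD": 10_000_000,
    }
    print("L1_Hit_Latency ~= %.3f" % l1_hit_latency(counts))
```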
toplev diff --git a/adl_glc_ratios.py b/adl_glc_ratios.py index e220b76e..794adfba 100644 --- a/adl_glc_ratios.py +++ b/adl_glc_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 12th gen Core (code name Alderlake) with Golden Cove +# auto generated TopDown/TMA 4.8-full-perf description for Intel 12th gen Core (code name Alderlake) with Golden Cove # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -46,7 +49,9 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 -Errata_Whitelist = "ADL038" +Errata_Whitelist = "ADL038;ADL066" +PERF_METRICS_MSR = 1 +DS = 0 # Aux. formulas @@ -55,7 +60,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + EV("RS.EMPTY:u1", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -100,11 +105,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -173,7 +178,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def Num_CPUs(self, EV, level): - return 24 if smt_enabled else 16 + return num_cores * num_threads if num_cores else(8 + 16 /(2 - smt_enabled)) + +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -187,7 +196,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -195,23 +204,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related 
bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + 
self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -229,24 +238,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. 
def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -257,7 +272,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -273,7 +288,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -295,7 +310,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (EV("FP_ARITH_DISPATCHED.PORT_0", level) + EV("FP_ARITH_DISPATCHED.PORT_1", level) + EV("FP_ARITH_DISPATCHED.PORT_5", level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -303,12 +318,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -333,11 +342,11 @@ def IpBranch(self, EV, level): # Instructions per (near) call (lower number means higher occurrence rate) def IpCall(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("CPU_CLK_UNHALTED.NEAR_CALL", level) + val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_CALL", level) self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -353,31 +362,31 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. 
def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -413,10 +422,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -445,6 +465,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
+def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.LSD.compute(EV) + self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_DATA.STALLS", level) / EV("ICACHE_DATA.STALLS:c1:e1", level) @@ -479,25 +505,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -507,7 +533,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -559,6 +585,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -575,19 +605,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -651,17 +677,17 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -733,11 +759,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_BUBBLES.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -772,7 +798,7 @@ class Fetch_Latency: maxval = None def compute(self, EV): try: - self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) + self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) if topdown_use_fixed else(EV("IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", 2) * Pipeline_Width - EV("INT_MISC.UOP_DROPPING", 2)) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Fetch_Latency zero division") @@ -795,7 +821,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -820,7 +846,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -877,7 +903,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -925,7 +951,7 @@ class Unknown_Branches: sample = ['FRONTEND_RETIRED.UNKNOWN_BRANCH'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1206,11 +1232,11 @@ class Branch_Mispredicts: sample = ['TOPDOWN.BR_MISPREDICT_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.BR_MISPREDICT_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) 
and self.parent.thresh except ZeroDivisionError: handle_error(self, "Branch_Mispredicts zero division") @@ -1237,7 +1263,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1261,7 +1287,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1291,7 +1317,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1315,11 +1341,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1351,7 +1377,7 @@ class Memory_Bound: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.MEMORY_BOUND_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Memory_Bound zero division") @@ -1407,8 +1433,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1439,7 +1465,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1463,7 +1489,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1508,13 +1534,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , 
max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1567,7 +1618,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1596,7 +1647,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1646,7 +1697,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1674,7 +1725,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1701,7 +1752,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1729,7 +1780,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1777,7 +1828,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1815,7 +1866,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1871,7 +1922,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1899,7 +1950,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1954,7 +2005,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, 
EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -1978,7 +2030,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2086,8 +2138,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIV_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2112,7 +2164,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2271,7 +2323,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + EV("RS.EMPTY:u1", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) + self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + max(EV("RS.EMPTY:u1", 4) - EV("RESOURCE_STALLS.SCOREBOARD", 4) , 0)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2381,7 +2433,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2544,11 +2596,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2827,9 +2879,9 @@ def compute(self, EV): handle_error(self, "Int_Vector_256b zero division") return self.val desc = """ -This metric represents 256-bit vector Integer ADD/SUB/SAD or -VNNI (Vector Neural Network Instructions) uops fraction the -CPU has retired.""" +This metric represents 256-bit vector Integer +ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) +uops fraction the CPU has retired.""" class Memory_Operations: @@ -2865,7 +2917,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2877,8 +2929,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. 
CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -2891,7 +2943,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2941,7 +2993,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2996,7 +3048,7 @@ class Heavy_Operations: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("UOPS_RETIRED.HEAVY", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) except ZeroDivisionError: handle_error(self, "Heavy_Operations zero division") @@ -3071,7 +3123,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3212,7 +3264,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3232,7 +3284,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3253,7 +3305,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3264,7 +3316,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3273,7 +3326,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3293,7 +3346,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3313,7 +3366,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3333,7 +3386,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3343,7 +3396,7 @@ def 
compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3354,7 +3407,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3375,7 +3428,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3397,7 +3450,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3407,10 +3460,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3419,7 +3471,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3429,31 +3481,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). 
Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3508,7 +3583,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3690,7 +3765,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3712,26 +3788,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3848,7 +3904,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3910,8 +3966,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3931,8 +3988,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). 
May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3952,8 +4010,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3973,8 +4032,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -3994,8 +4053,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4121,7 +4180,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4135,8 +4194,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4262,6 +4377,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + 
self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4417,7 +4552,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4437,7 +4572,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4457,7 +4592,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4477,7 +4612,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4518,7 +4653,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4759,6 +4894,26 @@ def compute(self, EV): loads (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4856,8 +5011,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4876,8 +5030,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4896,8 +5049,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4916,8 +5068,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5200,7 +5351,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = 
"Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5219,7 +5370,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5273,7 +5424,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5553,6 +5704,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = FB_Full() ; r.run(n) ; o["FB_Full"] = n @@ -5645,6 +5797,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["FB_Full"].parent = o["L1_Bound"] @@ -5722,7 +5875,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5736,7 +5890,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5757,12 +5910,16 @@ def __init__(self, r): n = Metric_Strings_Cycles() ; r.metric(n) ; o["Strings_Cycles"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_LSD_Coverage() ; r.metric(n) ; o["LSD_Coverage"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_Unknown_Branch_Cost() ; r.metric(n) ; o["Unknown_Branch_Cost"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5788,6 +5945,7 @@ def __init__(self, r): 
n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5961,44 +6119,51 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] + o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] - o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] - o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] + o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] - o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] 
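# --- Editor's note (illustrative sketch, not part of the generated model) ----
# The attribute assignments in this hunk implement the "Cross-reference
# Bottlenecks w/ TMA tree" item from the changelog: each Bottlenecks-View
# metric object is handed direct references to the tree-node objects it is
# derived from, e.g. Cache_Memory_Latency now also points at the new
# L1_Hit_Latency node. The helper below shows how a visualization tool could
# walk those references generically; related_tree_nodes() is a hypothetical
# name used only for this sketch, not toplev API.
def related_tree_nodes(metric, candidate_names):
    """Collect the TMA tree-node objects attached to a Bottlenecks-View metric."""
    return {name: getattr(metric, name)
            for name in candidate_names
            if hasattr(metric, name)}

# Hypothetical usage inside the __init__(self, r) wiring above, once the
# assignments have run:
#   related_tree_nodes(o["Cache_Memory_Latency"],
#                      ["L1_Hit_Latency", "L3_Hit_Latency", "MEM_Latency"])
#   -> {"L1_Hit_Latency": <node>, "L3_Hit_Latency": <node>, "MEM_Latency": <node>}
# ------------------------------------------------------------------------------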
o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -6006,6 +6171,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Synchronization"].L1_Bound = o["L1_Bound"] o["Memory_Synchronization"].Retiring = o["Retiring"] @@ -6081,7 +6247,7 @@ def __init__(self, r): o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] - o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] @@ -6101,6 +6267,7 @@ def __init__(self, r): o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] + o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] @@ -6123,19 +6290,19 @@ def __init__(self, r): o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].LSD = o["LSD"] o["DSB_Misses"].MITE = o["MITE"] @@ -6150,6 +6317,12 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] 
o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].LSD = o["LSD"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6226,5 +6399,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/bdw_client_ratios.py b/bdw_client_ratios.py index 557c36b0..6a1e9d1a 100644 --- a/bdw_client_ratios.py +++ b/bdw_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 5th gen Core / Core M (code named Broadwell) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 5th gen Core / Core M (code named Broadwell) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -49,6 +52,8 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "BDE69;BDE70" +EBS_Mode = 0 +DS = 0 # Aux. 
formulas @@ -88,11 +93,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def Frontend_RS_Empty_Cycles(self, EV, level): EV("RS_EVENTS.EMPTY_CYCLES", level) @@ -183,7 +188,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -217,13 +222,13 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -249,7 +254,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -265,31 +270,31 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. 
def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -303,7 +308,6 @@ def Instructions(self, EV, level): def Retire(self, EV, level): return Retired_Slots(self, EV, level) / EV("UOPS_RETIRED.RETIRE_SLOTS:c1", level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) @@ -323,7 +327,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -353,6 +357,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -365,15 +373,12 @@ def L2HPKI_Load(self, EV, level): def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -409,17 +414,17 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -477,7 +482,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -539,7 +544,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -564,7 +569,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -621,7 +626,7 @@ class Mispredicts_Resteers: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -669,7 +674,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -900,7 +905,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -931,7 +936,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -961,7 +966,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1053,8 +1058,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION:c1", 4) + 7 * EV("DTLB_LOAD_MISSES.WALK_COMPLETED", 4)) / CLKS(self, EV, 4) @@ -1196,7 +1201,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1225,7 +1230,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1275,7 +1280,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) 
maxval = 1.0 def compute(self, EV): try: @@ -1303,7 +1308,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1330,7 +1335,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1358,7 +1363,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1406,7 +1411,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1444,7 +1449,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1500,7 +1505,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1528,7 +1533,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1579,7 +1584,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1639,8 +1644,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -1787,7 +1792,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2068,7 +2073,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2345,7 +2350,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2451,7 +2456,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2612,7 +2617,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class 
Metric_CORE_CLKS: @@ -2731,7 +2737,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2793,8 +2799,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -2814,8 +2821,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -2835,8 +2843,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -2856,8 +2865,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -2877,8 +2886,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_Instructions: @@ -2922,7 +2931,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -2936,8 +2945,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" class Metric_DSB_Coverage: @@ -3020,7 +3028,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -3145,6 +3153,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -3221,8 +3249,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -3241,8 +3268,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -3261,8 +3287,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -3405,7 +3430,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -3424,7 +3449,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3478,7 +3503,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -3845,6 +3870,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n diff --git a/bdx_server_ratios.py b/bdx_server_ratios.py index fa52a13a..7a55edc1 100644 --- a/bdx_server_ratios.py +++ b/bdx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon E5 v4 (code named Broadwell EP) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon E5 v4 (code named Broadwell EP) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -49,6 +52,8 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "BDE69;BDE70" +EBS_Mode = 0 +DS = 1 # Aux. formulas @@ -88,11 +93,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def Frontend_RS_Empty_Cycles(self, EV, level): EV("RS_EVENTS.EMPTY_CYCLES", level) @@ -207,7 +212,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -241,13 +246,13 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. 
#Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -273,7 +278,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -289,31 +294,31 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. 
def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -327,7 +332,6 @@ def Instructions(self, EV, level): def Retire(self, EV, level): return Retired_Slots(self, EV, level) / EV("UOPS_RETIRED.RETIRE_SLOTS:c1", level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) @@ -347,7 +351,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -377,6 +381,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -389,15 +397,12 @@ def L2HPKI_Load(self, EV, level): def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -433,11 +438,11 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -447,7 +452,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -509,7 +514,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -571,7 +576,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -596,7 +601,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -653,7 +658,7 @@ class Mispredicts_Resteers: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -701,7 +706,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -932,7 +937,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -963,7 +968,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -993,7 +998,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1085,8 +1090,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION:c1", 4) + 7 * EV("DTLB_LOAD_MISSES.WALK_COMPLETED", 4)) / CLKS(self, EV, 4) @@ -1228,7 +1233,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1257,7 +1262,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1307,7 +1312,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) 
maxval = 1.0 def compute(self, EV): try: @@ -1335,7 +1340,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1362,7 +1367,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1390,7 +1395,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1438,7 +1443,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1476,7 +1481,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1605,7 +1610,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1633,7 +1638,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1684,7 +1689,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1744,8 +1749,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -1892,7 +1897,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2173,7 +2178,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2450,7 +2455,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2556,7 +2561,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2717,7 +2722,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per 
logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2836,7 +2842,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2898,8 +2904,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -2919,8 +2926,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -2940,8 +2948,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -2961,8 +2970,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -2982,8 +2991,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_Instructions: @@ -3027,7 +3036,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -3041,8 +3050,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" class Metric_DSB_Coverage: @@ -3125,7 +3133,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -3250,6 +3258,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -3326,8 +3354,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -3346,8 +3373,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -3366,8 +3392,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -3510,7 +3535,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -3529,7 +3554,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3602,7 +3627,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -3997,6 +4022,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n diff --git a/clx_server_ratios.py b/clx_server_ratios.py index 89d9553c..04517706 100644 --- a/clx_server_ratios.py +++ b/clx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors 2nd gen (code named Cascade Lake) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors 2nd gen (code named Cascade Lake) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 1 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,10 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "SKL091" +EBS_Mode = 0 +Memory = 1 +PMM_App_Direct = 1 if Memory == 1 else 0 +DS = 1 # Aux. formulas @@ -59,7 +66,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - (EV("BR_INST_RETIRED.COND", level) - EV("BR_INST_RETIRED.NOT_TAKEN", level)) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -113,7 +120,7 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): @@ -139,19 +146,24 @@ def LOAD_LCL_MEM(self, EV, level): return EV("MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) def LOAD_LCL_PMM(self, EV, level): - return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) + return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_FWD(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * 
(1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_HITM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_MEM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_PMM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_XSNP_HIT(self, EV, level): return EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT", level) @@ -166,7 +178,7 @@ def MEM_Bound_Ratio(self, EV, level): return EV("CYCLE_ACTIVITY.STALLS_L3_MISS", level) / CLKS(self, EV, level) + L2_Bound_Ratio(self, EV, level) - self.L2_Bound.compute(EV) def Mem_DDR_Hit_Fraction(self, EV, level): - return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) + return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) if DS else 1 def Mem_Lock_St_Fraction(self, EV, level): return EV("MEM_INST_RETIRED.LOCK_LOADS", level) / EV("MEM_INST_RETIRED.ALL_STORES", level) @@ -233,11 +245,9 @@ def Retired_Slots(self, EV, level): def Num_CPUs(self, EV, level): return 112 if smt_enabled else 56 -def Memory(self, EV, level): - return 1 - -def PMM_App_Direct(self, EV, level): - return 1 if Memory(self, EV, level)== 1 else 0 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -251,7 +261,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -259,23 +269,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + 
self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + 
self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -293,24 +303,30 @@ def Irregular_Overhead(self, 
EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -321,7 +337,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -355,7 +371,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -363,15 +379,9 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -397,7 +407,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -413,37 +423,37 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
+# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX512(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -473,10 +483,17 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -497,6 +514,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
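# For illustration only (hypothetical node values, not part of the generated model):
# with Frontend_Bound = 0.30, Fetch_Bandwidth = 0.20, Fetch_Latency = 0.10, and a DSB:MITE
# delivery split of 0.6:0.2, the formula below yields
# 100 * 0.30 * (0.20 / (0.20 + 0.10)) * (0.6 / (0.2 + 0.6)) = 15,
# i.e. roughly 15% of pipeline slots attributed to DSB-hit fetch bandwidth, above the >10 threshold.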
+def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) + 2 @@ -531,7 +554,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -541,7 +564,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -589,6 +612,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -605,19 +632,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -681,11 +704,11 @@ def UC_Load_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # 
Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -695,7 +718,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -747,7 +770,8 @@ def MEM_Parallel_Reads(self, EV, level): # Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_PMM_Read_Latency(self, EV, level): - return (OneBillion *(EV("UNC_M_PMM_RPQ_OCCUPANCY.ALL", level) / EV("UNC_M_PMM_RPQ_INSERTS", level)) / EV("UNC_M_CLOCKTICKS:one_unit", level)) + EV("UNC_M_PMM_RPQ_OCCUPANCY.ALL", level) + return (OneBillion *(EV("UNC_M_PMM_RPQ_OCCUPANCY.ALL", level) / EV("UNC_M_PMM_RPQ_INSERTS", level)) / EV("UNC_M_CLOCKTICKS:one_unit", level)) if PMM_App_Direct else 0 # Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_DRAM_Read_Latency(self, EV, level): @@ -755,11 +779,11 @@ def MEM_DRAM_Read_Latency(self, EV, level): # Average 3DXP Memory Bandwidth Use for reads [GB / sec] def PMM_Read_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average 3DXP Memory Bandwidth Use for Writes [GB / sec] def PMM_Write_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. 
Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU def IO_Read_BW(self, EV, level): @@ -797,7 +821,7 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -859,7 +883,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -884,7 +908,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -941,7 +965,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -989,7 +1013,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1243,7 +1267,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1274,7 +1298,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1298,7 +1322,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1328,7 +1352,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1352,7 +1376,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1444,8 +1468,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1476,7 +1500,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - 
self.Load_STLB_Miss.compute(EV) @@ -1500,7 +1524,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1545,13 +1569,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1635,7 +1684,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1664,7 +1713,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1714,7 +1763,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1742,7 +1791,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1769,7 +1818,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1797,7 +1846,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1825,7 +1874,7 @@ class DRAM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = (MEM_Bound_Ratio(self, EV, 3) - self.PMM_Bound.compute(EV)) + self.val = (MEM_Bound_Ratio(self, EV, 3) - self.PMM_Bound.compute(EV)) if PMM_App_Direct else MEM_Bound_Ratio(self, EV, 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "DRAM_Bound zero 
division") @@ -1845,7 +1894,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1883,7 +1932,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1909,7 +1958,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -1940,7 +1989,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_MEM zero division") @@ -1964,7 +2014,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -1989,10 +2041,10 @@ class PMM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0 ) - EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0) if PMM_App_Direct else 0 EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3) EV("MEM_LOAD_RETIRED.L1_MISS", 3) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "PMM_Bound zero division") @@ -2041,7 +2093,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2069,7 +2121,7 @@ class False_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] errcount = 0 sibling = None - metricgroup = 
frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2120,7 +2172,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2228,8 +2280,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2254,7 +2306,7 @@ class Serializing_Operation: sample = ['PARTIAL_RAT_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2342,7 +2394,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2453,7 +2505,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2734,7 +2786,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -3005,7 +3057,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3017,8 +3069,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -3031,7 +3083,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3081,7 +3133,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3186,7 +3238,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3277,7 +3329,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3297,7 +3349,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3318,7 +3370,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3329,7 +3381,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3338,7 +3391,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3358,7 +3411,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3378,7 +3431,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3398,7 +3451,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3408,7 +3461,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3419,7 +3472,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3440,7 +3493,7 @@ class 
Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3462,7 +3515,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3472,10 +3525,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3484,7 +3536,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3494,31 +3546,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. 
Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3573,7 +3648,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3734,7 +3809,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3756,26 +3832,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3892,7 +3948,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3954,8 +4010,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3975,8 +4032,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3996,8 +4054,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -4017,8 +4076,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -4038,8 +4097,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -4059,8 +4118,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4166,7 +4225,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4180,8 +4239,45 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4267,6 +4363,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4422,7 +4538,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4463,7 +4579,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4684,6 +4800,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo 
instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4781,8 +4917,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4801,8 +4936,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4821,8 +4955,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4841,8 +4974,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5128,7 +5260,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5147,7 +5279,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5220,7 +5352,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5644,6 +5776,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5736,6 +5869,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5814,7 +5948,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5827,7 +5962,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; 
o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5848,10 +5982,13 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5873,6 +6010,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5950,16 +6088,7 @@ def __init__(self, r): o["Core_Bound"].Retiring = o["Retiring"] o["Core_Bound"].Frontend_Bound = o["Frontend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilization"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -6040,6 +6169,7 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].Backend_Bound = o["Backend_Bound"] o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] @@ -6056,28 +6186,35 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] 
o["Cache_Memory_Latency"].PMM_Bound = o["PMM_Bound"] o["Cache_Memory_Latency"].Frontend_Bound = o["Frontend_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] - o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] @@ -6085,6 +6222,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].PMM_Bound = o["PMM_Bound"] o["Memory_Data_TLBs"].Frontend_Bound = o["Frontend_Bound"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -6157,20 +6295,12 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] - o["Other_Bottlenecks"].PMM_Bound = o["PMM_Bound"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -6178,43 +6308,52 @@ def 
__init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].PMM_Bound = o["PMM_Bound"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] o["Other_Bottlenecks"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] - o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] + o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = 
o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] @@ -6234,6 +6373,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6309,5 +6453,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/hsw_client_ratios.py b/hsw_client_ratios.py index 0d7dd304..b24bf58f 100644 --- a/hsw_client_ratios.py +++ b/hsw_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 4rd gen Core (code named Haswell) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 4rd gen Core (code named Haswell) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +EBS_Mode = 0 +DS = 0 # Aux. 
formulas @@ -179,7 +184,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -201,13 +206,13 @@ def SLOTS(self, EV, level): def CoreIPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return (EV("UOPS_EXECUTED.CORE", level) / 2 / Execute_Cycles(self, EV, level)) if smt_enabled else EV("UOPS_EXECUTED.CORE", level) / Execute_Cycles(self, EV, level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -233,7 +238,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -267,7 +272,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
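# Side note on the CPU_Utilization / CPUs_Utilized swap applied in the hunk above:
# CPUs_Utilized is now the raw ratio CPU_CLK_UNHALTED.REF_TSC / msr/tsc/ (an average
# number of busy CPUs, maxval 300), and CPU_Utilization divides that by Num_CPUs to
# give a 0..1 fraction of the machine (maxval 1). A minimal standalone sketch, assuming
# made-up counter readings and a hypothetical 8-CPU system (these values are not taken
# from the model files):
def example_cpu_utilization(ref_tsc_sum=8.0e9, tsc=2.0e9, num_cpus=8):
    cpus_utilized = ref_tsc_sum / tsc           # average number of busy CPUs, e.g. 4.0
    cpu_utilization = cpus_utilized / num_cpus  # fraction of the machine, e.g. 0.5
    return cpus_utilized, cpu_utilization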
def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -289,19 +294,20 @@ def L1MPKI(self, EV, level): def L2MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L2_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L3 cache true misses per kilo instruction for retired demand loads def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -337,11 +343,11 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -401,7 +407,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -463,7 +469,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -488,7 +494,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -750,7 +756,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -781,7 +787,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -811,7 +817,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -903,8 +909,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 
sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -1046,7 +1052,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1075,7 +1081,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1125,7 +1131,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1153,7 +1159,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1180,7 +1186,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1208,7 +1214,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1256,7 +1262,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1294,7 +1300,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1350,7 +1356,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1378,7 +1384,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1429,7 +1435,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1489,8 +1495,8 @@ class Divider: sample = ['ARITH.DIVIDER_UOPS'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = 10 * EV("ARITH.DIVIDER_UOPS", 3) / CORE_CLKS(self, EV, 3) @@ -1637,7 +1643,7 @@ class Ports_Utilized_3m: sample = [] 
errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -1918,7 +1924,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2042,7 +2048,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2148,7 +2154,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2247,7 +2253,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2366,7 +2373,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2508,7 +2515,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -2593,6 +2600,26 @@ def compute(self, EV): loads""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L3MPKI: name = "L3MPKI" domain = "Metric" @@ -2629,8 +2656,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -2649,8 +2675,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -2669,8 +2694,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -2813,7 +2837,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -2832,7 +2856,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3202,6 +3226,7 @@ def __init__(self, r): n = Metric_MLP() ; r.metric(n) ; o["MLP"] = n n = Metric_L1MPKI() ; r.metric(n) ; 
o["L1MPKI"] = n n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_L1D_Cache_Fill_BW() ; r.metric(n) ; o["L1D_Cache_Fill_BW"] = n n = Metric_L2_Cache_Fill_BW() ; r.metric(n) ; o["L2_Cache_Fill_BW"] = n diff --git a/hsx_server_ratios.py b/hsx_server_ratios.py index 755f2349..29ad7ea2 100644 --- a/hsx_server_ratios.py +++ b/hsx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon E5 v3 (code Named Haswell EP) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon E5 v3 (code Named Haswell EP) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +EBS_Mode = 0 +DS = 1 # Aux. formulas @@ -203,7 +208,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -225,13 +230,13 @@ def SLOTS(self, EV, level): def CoreIPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return (EV("UOPS_EXECUTED.CORE", level) / 2 / Execute_Cycles(self, EV, level)) if smt_enabled else EV("UOPS_EXECUTED.CORE", level) / Execute_Cycles(self, EV, level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -257,7 +262,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -291,7 +296,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -313,19 +318,20 @@ def L1MPKI(self, EV, level): def L2MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L2_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L3 cache true misses per kilo instruction for retired demand loads def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -361,11 +367,11 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -433,7 +439,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -495,7 +501,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -520,7 +526,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -782,7 +788,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -813,7 +819,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -843,7 +849,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -935,8 +941,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 
sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -1078,7 +1084,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1107,7 +1113,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1157,7 +1163,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1185,7 +1191,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1212,7 +1218,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1240,7 +1246,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1288,7 +1294,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1326,7 +1332,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1455,7 +1461,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1483,7 +1489,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1534,7 +1540,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1594,8 +1600,8 @@ class Divider: sample = ['ARITH.DIVIDER_UOPS'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = 10 * 
EV("ARITH.DIVIDER_UOPS", 3) / CORE_CLKS(self, EV, 3) @@ -1742,7 +1748,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2023,7 +2029,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2147,7 +2153,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2253,7 +2259,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2352,7 +2358,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2471,7 +2478,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2613,7 +2620,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -2698,6 +2705,26 @@ def compute(self, EV): loads""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L3MPKI: name = "L3MPKI" domain = "Metric" @@ -2734,8 +2761,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -2754,8 +2780,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -2774,8 +2799,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -2918,7 +2942,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -2937,7 +2961,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3354,6 
+3378,7 @@ def __init__(self, r): n = Metric_MLP() ; r.metric(n) ; o["MLP"] = n n = Metric_L1MPKI() ; r.metric(n) ; o["L1MPKI"] = n n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_L1D_Cache_Fill_BW() ; r.metric(n) ; o["L1D_Cache_Fill_BW"] = n n = Metric_L2_Cache_Fill_BW() ; r.metric(n) ; o["L2_Cache_Fill_BW"] = n diff --git a/icl_client_ratios.py b/icl_client_ratios.py index 114f6aa2..c3a7d3d9 100644 --- a/icl_client_ratios.py +++ b/icl_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 10th gen Core (code name Icelake) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 10th gen Core (code name Icelake) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -47,6 +50,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +PERF_METRICS_MSR = 1 +DS = 0 # Aux. formulas @@ -58,7 +63,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -103,11 +108,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0xfc", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -184,6 +189,10 @@ def Retired_Slots(self, EV, level): def Num_CPUs(self, EV, level): return 8 if smt_enabled else 4 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 + # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): val = 100 *(1 - Umisp(self, EV, level)) * (self.Branch_Mispredicts.compute(EV) + self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV))) @@ -196,7 +205,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -204,23 +213,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + 
self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + 
self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -238,24 +247,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks 
(apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -266,7 +281,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -282,7 +297,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -304,7 +319,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -312,12 +327,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -346,7 +355,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -362,37 +371,37 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
+# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. 
def IpArith_AVX512(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -422,10 +431,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -450,6 +470,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.LSD.compute(EV) + self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) @@ -484,25 +510,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -512,7 +538,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -564,6 +590,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all demand loads (including speculative) def L2HPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_HIT", level) / EV("INST_RETIRED.ANY", level) @@ -576,19 +606,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -652,17 +678,17 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -736,11 +762,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -798,7 +824,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -823,7 +849,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -880,7 +906,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -928,7 +954,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1232,7 +1258,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1263,7 +1289,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1287,7 +1313,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1317,7 +1343,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1341,11 +1367,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / 
EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + (Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + (Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) + Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1433,8 +1459,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1465,7 +1491,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1489,7 +1515,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1534,13 +1560,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. 
The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1624,7 +1675,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1653,7 +1704,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1703,7 +1754,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1731,7 +1782,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1758,7 +1809,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1786,7 +1837,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1834,7 +1885,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1872,7 +1923,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1928,7 +1979,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1956,7 +2007,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2011,7 +2062,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -2035,7 +2087,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = 
frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2143,8 +2195,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2169,7 +2221,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2257,7 +2309,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2368,7 +2420,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2557,11 +2609,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2829,7 +2881,7 @@ class Branch_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2876,7 +2928,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2982,7 +3034,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3073,7 +3125,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3093,7 +3145,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3114,7 +3166,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, 
EV): @@ -3125,7 +3177,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3134,7 +3187,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3154,7 +3207,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3174,7 +3227,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3194,7 +3247,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3204,7 +3257,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3215,7 +3268,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3236,7 +3289,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3258,7 +3311,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3268,10 +3321,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. 
Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3280,7 +3332,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3290,31 +3342,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3369,7 +3444,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3550,7 +3625,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3572,26 +3648,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. 
Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3708,7 +3764,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3770,8 +3826,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3791,8 +3848,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3812,8 +3870,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3833,8 +3892,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -3854,8 +3913,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -3875,8 +3934,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -3982,7 +4041,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -3996,8 +4055,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4103,6 +4218,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4258,7 +4393,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4278,7 +4413,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4298,7 +4433,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4318,7 +4453,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, 
"IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4359,7 +4494,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4600,6 +4735,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_Load: name = "L2HPKI_Load" domain = "Metric" @@ -4677,8 +4832,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4697,8 +4851,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4717,8 +4870,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4737,8 +4889,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5021,7 +5172,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5040,7 +5191,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5094,7 +5245,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5376,6 +5527,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5462,6 +5614,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5532,7 +5685,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5546,7 +5700,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5567,11 +5720,15 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_LSD_Coverage() ; r.metric(n) ; o["LSD_Coverage"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5597,6 +5754,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_FB_HPKI() ; r.metric(n) ; o["FB_HPKI"] = n @@ -5675,14 +5833,7 @@ def __init__(self, r): o["Core_Bound"].Retiring = o["Retiring"] 
o["Core_Bound"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -5774,8 +5925,9 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Bandwidth"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].Retiring = o["Retiring"] + o["Cache_Memory_Bandwidth"].G4K_Aliasing = o["G4K_Aliasing"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] @@ -5791,38 +5943,46 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] - o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] - o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].Split_Stores = 
o["Split_Stores"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] o["Memory_Data_TLBs"].Retiring = o["Retiring"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Store_Bound = o["Store_Bound"] o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -5889,19 +6049,12 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -5909,52 +6062,60 @@ def __init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + 
o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] o["Other_Bottlenecks"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] - o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] + o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Assists = o["Assists"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] - o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] - o["Core_Bound_Likely"].Retiring = o["Retiring"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Assists = o["Assists"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] + o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + 
o["Core_Bound_Likely"].Retiring = o["Retiring"] + o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].LSD = o["LSD"] o["DSB_Misses"].MITE = o["MITE"] @@ -5969,6 +6130,12 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].LSD = o["LSD"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6045,5 +6212,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/icx_server_ratios.py b/icx_server_ratios.py index 5ac5e0c6..673718a6 100644 --- a/icx_server_ratios.py +++ b/icx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors 3rd gen (code name Icelake Server) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors 3rd gen (code name Icelake Server) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -47,6 +50,10 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +Memory = 1 +PMM_App_Direct = 1 if Memory == 1 else 0 +PERF_METRICS_MSR = 1 +DS = 1 # Aux. 
formulas @@ -58,7 +65,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -103,11 +110,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0xfc", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -129,19 +136,24 @@ def LOAD_LCL_MEM(self, EV, level): return EV("MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) def LOAD_LCL_PMM(self, EV, level): - return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) + return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_FWD(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_HITM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_MEM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_PMM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_XSNP_HIT(self, EV, level): return EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT", level) @@ -156,7 +168,7 @@ def MEM_Bound_Ratio(self, EV, level): return EV("CYCLE_ACTIVITY.STALLS_L3_MISS", level) / CLKS(self, EV, level) + L2_Bound_Ratio(self, EV, level) - self.L2_Bound.compute(EV) def Mem_DDR_Hit_Fraction(self, EV, level): - return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, 
EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) + return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) if DS else 1 def Mem_Lock_St_Fraction(self, EV, level): return EV("MEM_INST_RETIRED.LOCK_LOADS", level) / EV("MEM_INST_RETIRED.ALL_STORES", level) @@ -215,13 +227,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def Num_CPUs(self, EV, level): - return 160 if smt_enabled else 80 - -def Memory(self, EV, level): - return 1 + return num_cores * num_sockets * num_threads if num_cores else 160 /(2 - smt_enabled ) -def PMM_App_Direct(self, EV, level): - return 1 if Memory(self, EV, level)== 1 else 0 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -235,7 +245,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -243,23 +253,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + 
self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + 
self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val 
> 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -277,24 +287,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. 
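As a minimal sketch (not part of the generated files; every percentage below is hypothetical), the Bottlenecks-View bookkeeping behind the Other_Bottlenecks formula above and the renamed Useful_Work defined just below is simply "100 scaled slots minus everything that was named":

# Illustrative sketch only, outside the patch; values are made up.
# Mirrors Other_Bottlenecks = 100 - sum(named Bottlenecks View metrics),
# where the named set now includes Useful_Work (formerly Base_Non_Br).
def other_bottlenecks(named):
    return 100.0 - sum(named.values())

named_bv = {
    "Mispredictions": 12.0, "Big_Code": 3.0, "Instruction_Fetch_BW": 6.0,
    "Cache_Memory_Bandwidth": 9.0, "Cache_Memory_Latency": 14.0,
    "Memory_Data_TLBs": 2.0, "Memory_Synchronization": 1.0,
    "Compute_Bound_Est": 8.0, "Irregular_Overhead": 2.0,
    "Branching_Overhead": 4.0, "Useful_Work": 31.0,
}
print(other_bottlenecks(named_bv))  # 8.0 -> leftover, unattributed pipeline cost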
+def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -305,7 +321,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -321,7 +337,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -343,7 +359,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -351,12 +367,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -385,7 +395,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -401,37 +411,37 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. 
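A standalone rendering of the Core_Bound_Likely helper moved above may help when reading the tree output; this is a sketch with hypothetical level-2 fractions, not toplev code:

# Illustrative sketch only, outside the patch; inputs are made-up fractions.
# The heuristic fires only when both SMT threads were busy, and grows as the
# measured Ports_Utilization exceeds the per-thread Core_Bound fraction.
def core_bound_likely(core_bound, ports_utilization, smt_2t_utilization):
    if smt_2t_utilization <= 0.5:
        return 0.0
    frac = (1 - core_bound / ports_utilization
            if core_bound < ports_utilization else 1)
    return 100.0 * frac

print(core_bound_likely(0.10, 0.25, 0.8))  # 60.0 -> Core Bound likely hidden by SMT
print(core_bound_likely(0.10, 0.25, 0.3))  # 0.0  -> SMT barely used, no adjustment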
def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. 
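The "Values < 1" wording above can be made concrete with a back-of-the-envelope example (hypothetical counts, not from any measurement): each retired FMA bumps FP_ARITH_INST_RETIRED.* by two, so an FMA-dense kernel legitimately reports more "FP arithmetic instructions" than instructions:

# Illustrative arithmetic only, outside the patch; counts are hypothetical.
inst_retired_any = 1000            # all retired instructions
fma_512b_retired = 900             # 512-bit FMAs among them
fp_arith_512b_events = 2 * fma_512b_retired  # FMA counts twice in the event

ip_arith_avx512 = inst_retired_any / fp_arith_512b_events
print(round(ip_arith_avx512, 2))   # 0.56 -- below 1 by design, not an error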
def IpArith_AVX512(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -461,10 +471,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -485,6 +506,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) @@ -519,25 +546,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). 
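The new Info.Pipeline fetch-bandwidth metrics defined above are plain uops-per-active-cycle ratios; a sketch with hypothetical event totals (not measured data):

# Illustrative sketch only, outside the patch; event totals are hypothetical.
counts = {
    "LSD.UOPS": 4.0e9,      "LSD.CYCLES_ACTIVE":   0.8e9,
    "IDQ.DSB_UOPS": 9.0e9,  "IDQ.DSB_CYCLES_ANY":  1.8e9,
    "IDQ.MITE_UOPS": 1.2e9, "IDQ.MITE_CYCLES_ANY": 0.5e9,
}
fetch_lsd  = counts["LSD.UOPS"] / counts["LSD.CYCLES_ACTIVE"]         # 5.0
fetch_dsb  = counts["IDQ.DSB_UOPS"] / counts["IDQ.DSB_CYCLES_ANY"]    # 5.0
fetch_mite = counts["IDQ.MITE_UOPS"] / counts["IDQ.MITE_CYCLES_ANY"]  # 2.4
print(fetch_lsd, fetch_dsb, fetch_mite)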
def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -547,7 +574,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -599,6 +626,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all demand loads (including speculative) def L2HPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_HIT", level) / EV("INST_RETIRED.ANY", level) @@ -611,19 +642,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -695,11 +722,11 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -709,7 +736,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -761,7 +788,9 @@ def MEM_Parallel_Reads(self, EV, level): # Average latency of data read request to external 3D X-Point memory [in nanoseconds]. 
Accounts for demand loads and L1/L2 data-read prefetches def MEM_PMM_Read_Latency(self, EV, level): - return 0 + EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) + EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level) + return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) if PMM_App_Direct else 0 # Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_DRAM_Read_Latency(self, EV, level): @@ -769,11 +798,11 @@ def MEM_DRAM_Read_Latency(self, EV, level): # Average 3DXP Memory Bandwidth Use for reads [GB / sec] def PMM_Read_BW(self, EV, level): - return 0 + return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average 3DXP Memory Bandwidth Use for Writes [GB / sec] def PMM_Write_BW(self, EV, level): - return 0 + return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU def IO_Read_BW(self, EV, level): @@ -811,11 +840,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -873,7 +902,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -898,7 +927,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -955,7 +984,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -1003,7 +1032,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1280,7 +1309,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 
'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1311,7 +1340,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1335,7 +1364,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1365,7 +1394,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1389,11 +1418,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + (Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + (Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) + Pipeline_Width * EV("INT_MISC.CLEARS_COUNT", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1481,8 +1510,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1513,7 +1542,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1537,7 +1566,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1582,13 +1611,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. 
The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1672,7 +1726,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1701,7 +1755,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1751,7 +1805,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1779,7 +1833,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1806,7 +1860,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1834,7 +1888,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1882,7 +1936,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1920,7 +1974,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1946,7 +2000,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -1977,7 +2031,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_MEM zero division") @@ -2001,7 +2056,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - 
Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -2026,7 +2083,10 @@ class PMM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = 0 + self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0) if PMM_App_Direct else 0 + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3) + EV("MEM_LOAD_RETIRED.L1_MISS", 3) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "PMM_Bound zero division") @@ -2075,7 +2135,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2103,7 +2163,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2158,7 +2218,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -2182,7 +2243,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2290,8 +2351,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2316,7 +2377,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2404,7 +2465,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2515,7 +2576,7 
@@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2704,11 +2765,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2976,7 +3037,7 @@ class Branch_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3023,7 +3084,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3129,7 +3190,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3220,7 +3281,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3240,7 +3301,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3261,7 +3322,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3272,7 +3333,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3281,7 +3343,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3301,7 +3363,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3321,7 +3383,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3341,7 +3403,7 @@ class 
Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3351,7 +3413,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3362,7 +3424,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3383,7 +3445,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3405,7 +3467,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3415,10 +3477,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3427,7 +3488,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3437,31 +3498,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). 
Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3516,7 +3600,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3697,7 +3781,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3719,26 +3804,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3855,7 +3920,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3917,8 +3982,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3938,8 +4004,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). 
May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3959,8 +4026,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3980,8 +4048,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -4001,8 +4069,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -4022,8 +4090,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4129,7 +4197,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4143,8 +4211,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from 
MITE per cycle""" class Metric_Fetch_UpC: @@ -4230,6 +4354,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4385,7 +4529,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4405,7 +4549,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4425,7 +4569,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4445,7 +4589,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4486,7 +4630,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4727,6 +4871,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_Load: name = "L2HPKI_Load" domain = "Metric" @@ -4804,8 +4968,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4824,8 +4987,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4844,8 +5006,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class 
Metric_L3_Cache_Access_BW: @@ -4864,8 +5025,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5189,7 +5349,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5208,7 +5368,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5281,7 +5441,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5706,6 +5866,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5795,6 +5956,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5869,7 +6031,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5883,7 +6046,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5904,10 +6066,14 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = 
n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5933,6 +6099,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_FB_HPKI() ; r.metric(n) ; o["FB_HPKI"] = n @@ -6015,20 +6182,14 @@ def __init__(self, r): o["Load_STLB_Hit"].DTLB_Load = o["DTLB_Load"] o["DRAM_Bound"].L2_Bound = o["L2_Bound"] o["MEM_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["PMM_Bound"].L2_Bound = o["L2_Bound"] o["Store_STLB_Hit"].DTLB_Store = o["DTLB_Store"] o["Store_STLB_Hit"].Store_STLB_Miss = o["Store_STLB_Miss"] o["Core_Bound"].Memory_Bound = o["Memory_Bound"] o["Core_Bound"].Retiring = o["Retiring"] o["Core_Bound"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -6117,45 +6278,53 @@ def __init__(self, r): o["Instruction_Fetch_BW"].MS_Switches = o["MS_Switches"] o["Instruction_Fetch_BW"].Unknown_Branches = o["Unknown_Branches"] o["Cache_Memory_Bandwidth"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].G4K_Aliasing = o["G4K_Aliasing"] o["Cache_Memory_Bandwidth"].Retiring = o["Retiring"] o["Cache_Memory_Bandwidth"].PMM_Bound = o["PMM_Bound"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] - o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] - o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] - 
o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] + o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] o["Cache_Memory_Latency"].PMM_Bound = o["PMM_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] - o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] + o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] @@ -6163,15 +6332,16 @@ def __init__(self, r): o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] o["Memory_Data_TLBs"].Retiring = o["Retiring"] o["Memory_Data_TLBs"].PMM_Bound = o["PMM_Bound"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Store_Bound = o["Store_Bound"] o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].Lock_Latency = 
o["Lock_Latency"] @@ -6239,20 +6409,12 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] - o["Other_Bottlenecks"].PMM_Bound = o["PMM_Bound"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -6260,52 +6422,61 @@ def __init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].PMM_Bound = o["PMM_Bound"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] o["Other_Bottlenecks"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] - 
o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] + o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Assists = o["Assists"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] - o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] - o["Core_Bound_Likely"].Retiring = o["Retiring"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Assists = o["Assists"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] + o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["Core_Bound_Likely"].Retiring = o["Retiring"] + o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].MITE = o["MITE"] o["DSB_Misses"].LCP = o["LCP"] @@ -6319,6 +6490,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6396,5 +6572,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = 
(o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/ivb_client_ratios.py b/ivb_client_ratios.py index b1adfcfc..495dc1f6 100644 --- a/ivb_client_ratios.py +++ b/ivb_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 3rd gen Core (code named IvyBridge) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 3rd gen Core (code named IvyBridge) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 15.6 +EBS_Mode = 0 +DS = 0 # Aux. formulas @@ -191,7 +196,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -221,13 +226,13 @@ def CoreIPC(self, EV, level): def FLOPc(self, EV, level): return FLOP_Count(self, EV, level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -253,7 +258,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -263,7 +268,7 @@ def IpTB(self, EV, level): def BpTkBranch(self, EV, level): return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. 
def IpArith(self, EV, level): val = 1 /(self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV)) self.thresh = (val < 10) @@ -277,7 +282,6 @@ def Instructions(self, EV, level): def Retire(self, EV, level): return Retired_Slots(self, EV, level) / EV("UOPS_RETIRED.RETIRE_SLOTS:c1", level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) @@ -297,7 +301,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -319,19 +323,20 @@ def L1MPKI(self, EV, level): def L2MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L2_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L3 cache true misses per kilo instruction for retired demand loads def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -367,17 +372,17 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -435,7 +440,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -497,7 +502,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -522,7 +527,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -784,7 +789,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -815,7 +820,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -845,7 +850,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -937,8 +942,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -1080,7 +1085,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1109,7 +1114,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1159,7 +1164,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1187,7 +1192,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1214,7 +1219,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ 
-1242,7 +1247,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1290,7 +1295,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1328,7 +1333,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1384,7 +1389,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1412,7 +1417,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1463,7 +1468,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1523,8 +1528,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -1671,7 +1676,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -1905,7 +1910,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2182,7 +2187,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2288,7 +2293,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2427,7 +2432,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2546,7 +2552,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2586,8 +2592,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). 
Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_Instructions: @@ -2631,7 +2638,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -2645,8 +2652,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" class Metric_DSB_Coverage: @@ -2729,7 +2735,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -2814,6 +2820,26 @@ def compute(self, EV): loads""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L3MPKI: name = "L3MPKI" domain = "Metric" @@ -2850,8 +2876,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -2870,8 +2895,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -2890,8 +2914,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -3034,7 +3057,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -3053,7 +3076,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3107,7 +3130,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -3456,6 +3479,7 @@ def __init__(self, r): n = Metric_MLP() ; r.metric(n) ; o["MLP"] = n n = Metric_L1MPKI() ; r.metric(n) ; o["L1MPKI"] = n n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_L1D_Cache_Fill_BW() ; r.metric(n) ; o["L1D_Cache_Fill_BW"] = n n = Metric_L2_Cache_Fill_BW() ; r.metric(n) ; o["L2_Cache_Fill_BW"] = n diff --git a/ivb_server_ratios.py b/ivb_server_ratios.py index 0351fa1f..a321a9b3 100644 --- a/ivb_server_ratios.py +++ b/ivb_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon E5 v2 (code named IvyBridge EP) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon E5 v2 (code named IvyBridge EP) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 15.6 +EBS_Mode = 0 +DS = 1 # Aux. formulas @@ -215,7 +220,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -245,13 +250,13 @@ def CoreIPC(self, EV, level): def FLOPc(self, EV, level): return FLOP_Count(self, EV, level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -277,7 +282,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -287,7 +292,7 @@ def IpTB(self, EV, level): def BpTkBranch(self, EV, level): return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = 1 /(self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV)) self.thresh = (val < 10) @@ -301,7 +306,6 @@ def Instructions(self, EV, level): def Retire(self, EV, level): return Retired_Slots(self, EV, level) / EV("UOPS_RETIRED.RETIRE_SLOTS:c1", level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) @@ -321,7 +325,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -343,19 +347,20 @@ def L1MPKI(self, EV, level): def L2MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.L2_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L3 cache true misses per kilo instruction for retired demand loads def L3MPKI(self, EV, level): return 1000 * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) @@ -391,11 +396,11 @@ def Data_L2_MLP(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -405,7 +410,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -467,7 +472,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -529,7 +534,7 @@ class ICache_Misses: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -554,7 +559,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -816,7 +821,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -847,7 +852,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -877,7 +882,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -969,8 +974,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -1112,7 +1117,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1141,7 +1146,7 @@ class L2_Bound: sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1191,7 +1196,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1219,7 +1224,7 @@ class Data_Sharing: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1246,7 +1251,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ 
-1274,7 +1279,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1322,7 +1327,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1360,7 +1365,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1489,7 +1494,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1517,7 +1522,7 @@ class False_Sharing: sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1568,7 +1573,7 @@ class DTLB_Store: sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -1628,8 +1633,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -1776,7 +1781,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2010,7 +2015,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2287,7 +2292,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY_WB_ASSIST'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -2393,7 +2398,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -2532,7 +2537,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -2651,7 +2657,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -2691,8 +2697,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). 
May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_Instructions: @@ -2736,7 +2743,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -2750,8 +2757,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" class Metric_DSB_Coverage: @@ -2834,7 +2840,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -2919,6 +2925,26 @@ def compute(self, EV): loads""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L3MPKI: name = "L3MPKI" domain = "Metric" @@ -2955,8 +2981,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -2975,8 +3000,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -2995,8 +3019,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_Page_Walks_Utilization: @@ -3139,7 +3162,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -3158,7 +3181,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -3231,7 +3254,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -3608,6 +3631,7 @@ def __init__(self, r): n = Metric_MLP() ; r.metric(n) ; o["MLP"] = n n = Metric_L1MPKI() ; r.metric(n) ; o["L1MPKI"] = n n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n n = Metric_L1D_Cache_Fill_BW() ; r.metric(n) ; o["L1D_Cache_Fill_BW"] = n n = Metric_L2_Cache_Fill_BW() ; r.metric(n) ; o["L2_Cache_Fill_BW"] = n diff --git a/jkt_server_ratios.py b/jkt_server_ratios.py index c78712b5..ca9197a2 100644 --- a/jkt_server_ratios.py +++ b/jkt_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon E5 (code named SandyBridge EP) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon E5 (code named SandyBridge EP) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -45,6 +48,8 @@ def handle_error_metric(obj, msg): Pipeline_Width = 4 OneMillion = 1000000 OneBillion = 1000000000 +EBS_Mode = 0 +DS = 1 # Aux. formulas @@ -161,13 +166,13 @@ def CoreIPC(self, EV, level): def FLOPc(self, EV, level): return FLOP_Count(self, EV, level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_DISPATCHED.THREAD", level) / Execute_Cycles(self, EV, level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Total number of retired Instructions def Instructions(self, EV, level): @@ -185,11 +190,11 @@ def DSB_Coverage(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -199,7 +204,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -261,7 +266,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -323,7 +328,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -527,7 +532,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -558,7 +563,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -588,7 +593,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -650,8 +655,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -730,7 +735,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -768,7 +773,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -855,8 +860,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -914,7 +919,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1356,7 +1361,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -1444,7 +1450,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -1463,7 +1469,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 
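The corrected System metrics above now derive CPUs_Utilized directly from CPU_CLK_UNHALTED.REF_TSC over the msr/tsc/ reference, and CPU_Utilization as that count divided by Num_CPUs (hence the swapped maxval limits). A minimal standalone sketch of that relationship, with made-up counter values; treating REF_TSC as the sum over all measured CPUs while the tsc reference reflects one CPU's elapsed ticks is an assumption made purely for illustration:

# Minimal sketch (not toplev code): corrected Info.System relationship.
NUM_CPUS = 16                 # Num_CPUs(): logical processors (assumed value)
TSC_HZ = 2.4e9                # msr/tsc/ advances at TSC rate on every CPU

ref_tsc_sum = 12 * TSC_HZ     # CPU_CLK_UNHALTED.REF_TSC, assumed summed over all CPUs:
                              # here 12 CPUs were unhalted for the whole 1-second window
tsc_elapsed = 1.0 * TSC_HZ    # msr/tsc/ ticks elapsed in the same window

cpus_utilized = ref_tsc_sum / tsc_elapsed      # new CPUs_Utilized  -> 12.0 (maxval 300)
cpu_utilization = cpus_utilized / NUM_CPUS     # new CPU_Utilization -> 0.75 (maxval 1)

assert abs(cpus_utilized - 12.0) < 1e-9
assert abs(cpu_utilization - 0.75) < 1e-9
print(f"CPUs_Utilized={cpus_utilized:.1f}  CPU_Utilization={cpu_utilization:.2f}")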
+ maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -1536,7 +1542,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: diff --git a/skl_client_ratios.py b/skl_client_ratios.py index 774e13d3..61bbdf65 100644 --- a/skl_client_ratios.py +++ b/skl_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 6th/7th gen Core (code named Skykale/Kabylake/Coffeelake) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 6th/7th gen Core (code named Skykale/Kabylake/Coffeelake) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "SKL091" +EBS_Mode = 0 +DS = 0 # Aux. formulas @@ -59,7 +64,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - (EV("BR_INST_RETIRED.COND", level) - EV("BR_INST_RETIRED.NOT_TAKEN", level)) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -113,11 +118,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -200,6 +205,10 @@ def Retired_Slots(self, EV, level): def Num_CPUs(self, EV, level): return 8 if smt_enabled else 4 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 + # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): val = 100 *(1 - Umisp(self, EV, level)) * (self.Branch_Mispredicts.compute(EV) + self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV))) @@ -212,7 +221,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -220,23 +229,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + 
self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return 
val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -254,24 +263,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. 
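The Bottlenecks-View formulas above (Memory_Data_TLBs, Memory_Synchronization, and the others) all follow the same pattern: a tree leaf is weighted by its share of each parent level and reported as a percentage of pipeline slots, and the Other_Bottlenecks definition that follows closes the accounting to 100. A minimal numeric sketch of the load-side term of Memory_Data_TLBs only, with made-up node values (the store-side term is accumulated the same way):

# Minimal sketch (not toplev code) of the Bottlenecks-View weighting pattern.
memory_bound = 0.40                          # level-2 node, fraction of slots
l1_bound, l2_bound, l3_bound = 0.10, 0.05, 0.15
dram_bound, store_bound = 0.06, 0.04         # level-3 siblings under Memory_Bound
dtlb_load = 0.03                             # level-4 leaf under L1_Bound (fraction of clocks)
l1_leaves_sum = 0.08                         # sum of all L1_Bound child estimates (assumed)

l3_siblings_sum = l1_bound + l2_bound + l3_bound + dram_bound + store_bound
memory_data_tlbs_load_part = 100 * (
    memory_bound
    * (l1_bound / max(memory_bound, l3_siblings_sum))   # L1_Bound's share of Memory_Bound
    * (dtlb_load / max(l1_bound, l1_leaves_sum)))        # DTLB_Load's share within L1_Bound
print(f"load-side Memory_Data_TLBs ~= {memory_data_tlbs_load_part:.1f} scaled-slots %")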
+# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -282,7 +297,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -316,7 +331,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -324,15 +339,9 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -358,7 +367,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -374,31 +383,31 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
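The reworded descriptions make the FMA behavior explicit: the FP_ARITH_INST_RETIRED events intentionally count each FMA as two operations, so an instructions-per-operation ratio such as IpArith can legitimately fall below 1. A small worked example with a made-up instruction mix:

# Minimal worked example (made-up counts): why IpArith-style ratios can be < 1.
inst_retired_any = 1000                      # retired instructions in the window
fma_instructions = 800                       # scalar double-precision FMAs among them
scalar_double_ops = 2 * fma_instructions     # event counts 2 per FMA -> 1600

ip_arith_scalar_dp = inst_retired_any / scalar_double_ops
print(f"IpArith_Scalar_DP = {ip_arith_scalar_dp:.3f}")   # 0.625, i.e. below 1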
+# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -428,10 +437,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -456,6 +476,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
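The new Info.Pipeline helpers above each report uops delivered per cycle in which that fetch unit was active; the DSB_Bandwidth bottleneck defined next then weighs the DSB share of Fetch_Bandwidth. A minimal standalone sketch of the per-unit calculation, using made-up raw counts rather than a real perf session:

# Minimal sketch (not toplev code): uops per active cycle for each fetch unit,
# mirroring the Fetch_LSD / Fetch_DSB / Fetch_MITE helpers above.
counts = {
    "LSD.UOPS": 1.2e9,  "LSD.CYCLES_ACTIVE": 0.3e9,
    "IDQ.DSB_UOPS": 3.6e9, "IDQ.DSB_CYCLES_ANY": 0.8e9,
    "IDQ.MITE_UOPS": 0.9e9, "IDQ.MITE_CYCLES": 0.4e9,
}

def per_cycle(uops_event, cycles_event):
    # Guard against an idle unit, analogous to the ZeroDivisionError handling in the model
    cycles = counts[cycles_event]
    return counts[uops_event] / cycles if cycles else 0.0

print("Fetch_LSD :", per_cycle("LSD.UOPS", "LSD.CYCLES_ACTIVE"))      # ~4.0
print("Fetch_DSB :", per_cycle("IDQ.DSB_UOPS", "IDQ.DSB_CYCLES_ANY")) # ~4.5
print("Fetch_MITE:", per_cycle("IDQ.MITE_UOPS", "IDQ.MITE_CYCLES"))   # ~2.25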
+def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.LSD.compute(EV) + self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) + 2 @@ -490,7 +516,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -500,7 +526,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -548,6 +574,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -564,19 +594,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -632,17 +658,17 @@ def UC_Load_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) 
/ EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -708,7 +734,7 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -770,7 +796,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -795,7 +821,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -852,7 +878,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -900,7 +926,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1181,7 +1207,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1212,7 +1238,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1236,7 +1262,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1266,7 +1292,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1290,7 +1316,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1382,8 +1408,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + 
maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1414,7 +1440,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1438,7 +1464,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1483,13 +1509,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1573,7 +1624,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1602,7 +1653,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1652,7 +1703,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1680,7 +1731,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1707,7 +1758,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1735,7 +1786,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = 
frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1783,7 +1834,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1821,7 +1872,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1877,7 +1928,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -1905,7 +1956,7 @@ class False_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1956,7 +2007,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2064,8 +2115,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2090,7 +2141,7 @@ class Serializing_Operation: sample = ['PARTIAL_RAT_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2178,7 +2229,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2289,7 +2340,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2570,7 +2621,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2817,7 +2868,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2829,8 +2880,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -2843,7 +2894,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2893,7 +2944,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2998,7 +3049,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3089,7 +3140,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3109,7 +3160,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3130,7 +3181,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3141,7 +3192,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3150,7 +3202,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3170,7 +3222,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3190,7 +3242,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3210,7 +3262,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3220,7 +3272,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3231,7 +3283,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3252,7 +3304,7 @@ class 
Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3274,7 +3326,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3284,10 +3336,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3296,7 +3347,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3306,31 +3357,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. 
Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3385,7 +3459,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3546,7 +3620,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3568,26 +3643,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3704,7 +3759,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3766,8 +3821,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3787,8 +3843,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3808,8 +3865,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3829,8 +3887,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -3850,8 +3908,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -3957,7 +4015,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -3971,8 +4029,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4078,6 +4192,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4233,7 +4367,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4274,7 +4408,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4495,6 +4629,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache 
miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4592,8 +4746,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4612,8 +4765,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4632,8 +4784,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4652,8 +4803,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -4898,7 +5048,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -4917,7 +5067,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -4971,7 +5121,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5228,6 +5378,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5316,6 +5467,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5389,7 +5541,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5402,7 +5555,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; 
r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5422,11 +5574,15 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_LSD_Coverage() ; r.metric(n) ; o["LSD_Coverage"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5448,6 +5604,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5512,16 +5669,7 @@ def __init__(self, r): o["Core_Bound"].Retiring = o["Retiring"] o["Core_Bound"].Frontend_Bound = o["Frontend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilization"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -5610,6 +5758,7 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] @@ -5617,27 +5766,34 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] 
o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Latency"].Retiring = o["Retiring"] o["Cache_Memory_Latency"].Frontend_Bound = o["Frontend_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] @@ -5645,6 +5801,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] o["Memory_Data_TLBs"].Frontend_Bound = o["Frontend_Bound"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -5715,19 +5872,12 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -5735,43 
+5885,51 @@ def __init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] o["Other_Bottlenecks"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] - o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - 
o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] @@ -5792,6 +5950,12 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].LSD = o["LSD"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -5865,5 +6029,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/skx_server_ratios.py b/skx_server_ratios.py index a417876a..8137f5d0 100644 --- a/skx_server_ratios.py +++ b/skx_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors (code named Skylake Server) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors (code named Skylake Server) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -48,6 +51,8 @@ def handle_error_metric(obj, msg): OneBillion = 1000000000 Energy_Unit = 61 Errata_Whitelist = "SKL091" +EBS_Mode = 0 +DS = 1 # Aux. 
formulas @@ -59,7 +64,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - (EV("BR_INST_RETIRED.COND", level) - EV("BR_INST_RETIRED.NOT_TAKEN", level)) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -113,7 +118,7 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): @@ -218,6 +223,10 @@ def Retired_Slots(self, EV, level): def Num_CPUs(self, EV, level): return 8 if smt_enabled else 4 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 + # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): val = 100 *(1 - Umisp(self, EV, level)) * (self.Branch_Mispredicts.compute(EV) + self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV))) @@ -230,7 +239,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -238,25 +247,25 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + 
self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + 
(self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.G4K_Aliasing.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across 
processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * ((self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV))) * self.Remote_Cache.compute(EV) / (self.Remote_Cache.compute(EV) + self.Remote_MEM.compute(EV) + self.Local_MEM.compute(EV)) + (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) + val = 100 *(self.Memory_Bound.compute(EV) * ((self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV))) * self.Remote_Cache.compute(EV) / (self.Remote_Cache.compute(EV) + self.Remote_MEM.compute(EV) + self.Local_MEM.compute(EV)) + (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) if DS else 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Store_Latency.compute(EV) + 
self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) return val @@ -272,24 +281,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -300,7 +315,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -334,7 +349,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -342,15 +357,9 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Instructions per Load (lower number means higher occurrence rate). Tip: reduce memory accesses. #Link Opt Guide section: Minimize Register Spills def IpLoad(self, EV, level): @@ -376,7 +385,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -392,37 +401,37 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
+# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX512(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) @@ -448,10 +457,17 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -472,6 +488,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
+def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_16B.IFDATA_STALL", level) / EV("ICACHE_16B.IFDATA_STALL:c1:e1", level) + 2 @@ -506,7 +528,7 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = Instructions(self, EV, level) / (Retire_Fraction(self, EV, level) * EV("BR_MISP_EXEC.INDIRECT", level)) self.thresh = (val < 1000) @@ -516,7 +538,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -564,6 +586,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("OFFCORE_REQUESTS.DEMAND_RFO", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -580,19 +606,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -656,11 +678,11 @@ def UC_Load_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # 
Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -670,7 +692,7 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -760,7 +782,7 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -822,7 +844,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -847,7 +869,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -904,7 +926,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -952,7 +974,7 @@ class Unknown_Branches: sample = ['BACLEARS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1206,7 +1228,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1237,7 +1259,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1261,7 +1283,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1291,7 +1313,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1315,7 +1337,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1407,8 +1429,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = 
frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1439,7 +1461,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1463,7 +1485,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1508,13 +1530,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("CYCLE_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1598,7 +1645,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1627,7 +1674,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1677,7 +1724,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1705,7 +1752,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1732,7 +1779,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1760,7 +1807,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 
'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1808,7 +1855,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1846,7 +1893,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1872,7 +1919,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -1903,7 +1950,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_MEM zero division") @@ -1927,7 +1975,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -1975,7 +2025,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2003,11 +2053,11 @@ class False_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_miss_remote_hitm(self, EV, 4) + Mem_XSNP_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_hit_snoop_hitm(self, EV, 4)) / CLKS(self, EV, 4) + self.val = (Mem_Remote_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_miss_remote_hitm(self, EV, 4) + Mem_XSNP_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_hit_snoop_hitm(self, EV, 4)) / CLKS(self, EV, 4) if DS else Mem_XSNP_HitM_Cost(self, EV, 4) * OCR_all_rfo_l3_hit_snoop_hitm(self, EV, 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, 
"False_Sharing zero division") @@ -2054,7 +2104,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2162,8 +2212,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIVIDER_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2188,7 +2238,7 @@ class Serializing_Operation: sample = ['PARTIAL_RAT_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2253,7 +2303,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + self.Core_Bound.compute(EV) * EV("RS_EVENTS.EMPTY_CYCLES", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 4)) / CLKS(self, EV, 4) + self.val = EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2364,7 +2414,7 @@ class Ports_Utilized_3m: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2645,7 +2695,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2916,7 +2966,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2928,8 +2978,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -2942,7 +2992,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2992,7 +3042,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3097,7 +3147,7 @@ class Assists: sample = ['OTHER_ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3188,7 +3238,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3208,7 +3258,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3229,7 +3279,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3240,7 +3290,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3249,7 +3300,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3269,7 +3320,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3289,7 +3340,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3309,7 +3360,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3319,7 +3370,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3330,7 +3381,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3351,7 +3402,7 @@ class 
Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3373,7 +3424,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3383,10 +3434,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3395,7 +3445,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3405,31 +3455,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. 
Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3484,7 +3557,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3645,7 +3718,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -3667,26 +3741,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3803,7 +3857,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -3865,8 +3919,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_SP: @@ -3886,8 +3941,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -3907,8 +3963,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -3928,8 +3985,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -3949,8 +4006,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -3970,8 +4027,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpSWPF: @@ -4057,7 +4114,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4071,8 +4128,45 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4158,6 +4252,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4313,7 +4427,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4354,7 +4468,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -4575,6 +4689,26 @@ def compute(self, EV): (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo 
instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4672,8 +4806,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4692,8 +4825,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4712,8 +4844,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4732,8 +4863,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5019,7 +5149,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5038,7 +5168,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5111,7 +5241,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -5476,6 +5606,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n @@ -5566,6 +5697,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] @@ -5642,7 +5774,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5655,7 +5788,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; 
o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5675,10 +5807,13 @@ def __init__(self, r): n = Metric_Retire() ; r.metric(n) ; o["Retire"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5700,6 +5835,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5772,16 +5908,7 @@ def __init__(self, r): o["Core_Bound"].Retiring = o["Retiring"] o["Core_Bound"].Frontend_Bound = o["Frontend_Bound"] o["Ports_Utilization"].Ports_Utilized_0 = o["Ports_Utilized_0"] - o["Ports_Utilization"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilization"].Core_Bound = o["Core_Bound"] - o["Ports_Utilization"].Backend_Bound = o["Backend_Bound"] o["Ports_Utilization"].Retiring = o["Retiring"] - o["Ports_Utilization"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Memory_Bound = o["Memory_Bound"] - o["Ports_Utilized_0"].Retiring = o["Retiring"] - o["Ports_Utilized_0"].Core_Bound = o["Core_Bound"] - o["Ports_Utilized_0"].Frontend_Bound = o["Frontend_Bound"] - o["Ports_Utilized_0"].Backend_Bound = o["Backend_Bound"] o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -5870,6 +5997,7 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] @@ -5877,27 +6005,34 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - o["Cache_Memory_Latency"].Retiring = o["Retiring"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + 
o["Cache_Memory_Latency"].Retiring = o["Retiring"] o["Cache_Memory_Latency"].Frontend_Bound = o["Frontend_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].G4K_Aliasing = o["G4K_Aliasing"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].Backend_Bound = o["Backend_Bound"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].G4K_Aliasing = o["G4K_Aliasing"] @@ -5905,6 +6040,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] o["Memory_Data_TLBs"].Frontend_Bound = o["Frontend_Bound"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -5980,20 +6116,13 @@ def __init__(self, r): o["Irregular_Overhead"].Other_Nukes = o["Other_Nukes"] o["Irregular_Overhead"].Unknown_Branches = o["Unknown_Branches"] o["Irregular_Overhead"].Fetch_Latency = o["Fetch_Latency"] - o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] o["Other_Bottlenecks"].Local_MEM = o["Local_MEM"] - o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] o["Other_Bottlenecks"].Retiring = o["Retiring"] o["Other_Bottlenecks"].Data_Sharing = o["Data_Sharing"] o["Other_Bottlenecks"].L2_Bound = o["L2_Bound"] - o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] - o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].Contested_Accesses = o["Contested_Accesses"] - o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] - o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] - o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] o["Other_Bottlenecks"].Other_Mispredicts = o["Other_Mispredicts"] @@ -6001,6 +6130,25 @@ def __init__(self, r): o["Other_Bottlenecks"].Ports_Utilized_0 = 
o["Ports_Utilized_0"] o["Other_Bottlenecks"].Ports_Utilized_1 = o["Ports_Utilized_1"] o["Other_Bottlenecks"].Ports_Utilized_2 = o["Ports_Utilized_2"] + o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] + o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] + o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] + o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] + o["Other_Bottlenecks"].FB_Full = o["FB_Full"] + o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] + o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] + o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] + o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] + o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] + o["Other_Bottlenecks"].L1_Bound = o["L1_Bound"] + o["Other_Bottlenecks"].G4K_Aliasing = o["G4K_Aliasing"] + o["Other_Bottlenecks"].Core_Bound = o["Core_Bound"] + o["Other_Bottlenecks"].Divider = o["Divider"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].Assists = o["Assists"] o["Other_Bottlenecks"].Backend_Bound = o["Backend_Bound"] o["Other_Bottlenecks"].Branch_Resteers = o["Branch_Resteers"] @@ -6009,37 +6157,26 @@ def __init__(self, r): o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Fetch_Latency = o["Fetch_Latency"] o["Other_Bottlenecks"].Remote_MEM = o["Remote_MEM"] - o["Other_Bottlenecks"].DTLB_Load = o["DTLB_Load"] + o["Other_Bottlenecks"].Ports_Utilization = o["Ports_Utilization"] o["Other_Bottlenecks"].False_Sharing = o["False_Sharing"] - o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Heavy_Operations = o["Heavy_Operations"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] - o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] + o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].MEM_Latency = o["MEM_Latency"] - o["Other_Bottlenecks"].Store_Bound = o["Store_Bound"] o["Other_Bottlenecks"].Split_Loads = o["Split_Loads"] - o["Other_Bottlenecks"].Bad_Speculation = o["Bad_Speculation"] o["Other_Bottlenecks"].ITLB_Misses = o["ITLB_Misses"] - o["Other_Bottlenecks"].Mispredicts_Resteers = o["Mispredicts_Resteers"] - o["Other_Bottlenecks"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Other_Bottlenecks"].Serializing_Operation = o["Serializing_Operation"] o["Other_Bottlenecks"].DTLB_Store = o["DTLB_Store"] o["Other_Bottlenecks"].Branch_Mispredicts = o["Branch_Mispredicts"] o["Other_Bottlenecks"].LCP = o["LCP"] - o["Other_Bottlenecks"].Split_Stores = o["Split_Stores"] - o["Other_Bottlenecks"].Few_Uops_Instructions = o["Few_Uops_Instructions"] o["Other_Bottlenecks"].Lock_Latency = o["Lock_Latency"] - o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] o["Other_Bottlenecks"].Clears_Resteers = o["Clears_Resteers"] o["Other_Bottlenecks"].MS_Switches = o["MS_Switches"] - o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] - o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] - o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = 
o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] + o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] @@ -6059,6 +6196,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6134,5 +6276,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/snb_client_ratios.py b/snb_client_ratios.py index 6c6c4cbf..a78c893b 100644 --- a/snb_client_ratios.py +++ b/snb_client_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel 2nd gen Core (code named SandyBridge) +# auto generated TopDown/TMA 4.8-full-perf description for Intel 2nd gen Core (code named SandyBridge) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 def handle_error(obj, msg): @@ -45,6 +48,8 @@ def handle_error_metric(obj, msg): Pipeline_Width = 4 OneMillion = 1000000 OneBillion = 1000000000 +EBS_Mode = 0 +DS = 0 # Aux. 
formulas @@ -161,13 +166,13 @@ def CoreIPC(self, EV, level): def FLOPc(self, EV, level): return FLOP_Count(self, EV, level) / CORE_CLKS(self, EV, level) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_DISPATCHED.THREAD", level) / Execute_Cycles(self, EV, level) # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): - return (EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) + return ((EV("CPU_CLK_UNHALTED.THREAD", level) / 2) * (1 + EV("CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", level) / EV("CPU_CLK_UNHALTED.REF_XCLK", level))) if ebs_mode else(EV("CPU_CLK_UNHALTED.THREAD_ANY", level) / 2) if smt_enabled else CLKS(self, EV, level) # Total number of retired Instructions def Instructions(self, EV, level): @@ -185,17 +190,17 @@ def DSB_Coverage(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -249,7 +254,7 @@ class Frontend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -311,7 +316,7 @@ class ITLB_Misses: sample = ['ITLB_MISSES.WALK_COMPLETED'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -515,7 +520,7 @@ class Branch_Mispredicts: sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -546,7 +551,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -576,7 +581,7 @@ class Backend_Bound: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -638,8 +643,8 @@ class DTLB_Load: sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(self, EV, 4) @@ -718,7 +723,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -756,7 +761,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -843,8 +848,8 @@ class Divider: sample = ['ARITH.FPU_DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(self, EV, 3) @@ -902,7 +907,7 @@ class Retiring: sample = ['UOPS_RETIRED.RETIRE_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1344,7 +1349,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_CORE_CLKS: @@ -1432,7 +1438,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -1451,7 +1457,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 
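# --- Editor's illustrative sketch (hypothetical helper, not part of the patch or the generated files) ---
# The hunk above swaps the derivation of the two Info.System metrics: CPUs_Utilized
# is now the primary count (unhalted reference cycles over TSC) and CPU_Utilization
# is that count normalized by the number of logical CPUs, which is why maxval moves
# to 1 for the fraction and to a large count (300) for CPUs_Utilized. Names and
# numbers below are made up for illustration only.
def sketch_system_utilization(ref_tsc_cycles, tsc_cycles, num_cpus):
    cpus_utilized = ref_tsc_cycles / tsc_cycles   # average number of CPUs kept busy
    cpu_utilization = cpus_utilized / num_cpus    # fraction in [0, 1]
    return cpus_utilized, cpu_utilization

# Example: 8e9 aggregated unhalted REF_TSC cycles against 1e9 TSC ticks on a
# 16-CPU system gives CPUs_Utilized = 8.0 and CPU_Utilization = 0.5.
assert sketch_system_utilization(8e9, 1e9, 16) == (8.0, 0.5)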
+ maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -1505,7 +1511,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: diff --git a/spr_max_server_ratios.py b/spr_max_server_ratios.py index ef9a25be..8687f88b 100644 --- a/spr_max_server_ratios.py +++ b/spr_max_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors 4th gen MAX (code name Sapphire Rapids) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors 4th gen MAX (code name Sapphire Rapids) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 1 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -46,7 +49,13 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 -Errata_Whitelist = "ADL038" +Errata_Whitelist = "ADL038;ADL066" +Memory = 2 +PMM_App_Direct = 1 if Memory == 1 else 0 +HBM = 1 if Memory > 1 else 0 +PERF_METRICS_MSR = 1 +FP16 = 1 +DS = 1 # Aux. formulas @@ -55,7 +64,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + EV("RS.EMPTY:u1", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.AMX_Busy.compute(EV) + self.Divider.compute(EV)) @@ -100,11 +109,15 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + EV("FP_ARITH_INST_RETIRED.SCALAR", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) if FP16 else EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + EV("FP_ARITH_INST_RETIRED.VECTOR", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) if FP16 else EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -120,19 +133,24 @@ def LOAD_LCL_MEM(self, EV, level): return EV("MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) def LOAD_LCL_PMM(self, EV, level): - return 
EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) + return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_FWD(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_HITM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_MEM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_PMM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_XSNP_HIT(self, EV, level): return EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", level) + EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", level) * (1 - True_XSNP_HitM_Fraction(self, EV, level)) @@ -147,7 +165,7 @@ def MEM_Bound_Ratio(self, EV, level): return EV("MEMORY_ACTIVITY.STALLS_L3_MISS", level) / CLKS(self, EV, level) def Mem_DDR_Hit_Fraction(self, EV, level): - return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) + return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) if DS else 1 def Mem_Lock_St_Fraction(self, EV, level): return EV("MEM_INST_RETIRED.LOCK_LOADS", level) / EV("MEM_INST_RETIRED.ALL_STORES", level) @@ -180,16 +198,16 @@ def Mem_XSNP_None_Cost(self, EV, level): return 12 * Core_Frequency(self, EV, level) def Mem_Local_DRAM_Cost(self, EV, level): - return 108 * Core_Frequency(self, EV, level) + return 109 * Core_Frequency(self, EV, level) def Mem_Remote_DRAM_Cost(self, EV, level): - return 186 * Core_Frequency(self, EV, level) + return 190 * Core_Frequency(self, EV, level) def Mem_Remote_HitM_Cost(self, EV, level): - return 172.5 * Core_Frequency(self, EV, level) + return 170 * Core_Frequency(self, EV, level) def Mem_Remote_Fwd_Cost(self, EV, level): - return 172.5 * Core_Frequency(self, EV, level) + return 170 * Core_Frequency(self, EV, level) def Mem_L2_Hit_Cost(self, EV, level): return 3 * Core_Frequency(self, EV, level) @@ -206,16 +224,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def 
Num_CPUs(self, EV, level): - return 24 if smt_enabled else 16 - -def Memory(self, EV, level): - return 3 if HBM_Only(self, EV, level) else 2 - -def PMM_App_Direct(self, EV, level): - return 1 if Memory(self, EV, level)== 1 else 0 + return num_cores * num_threads if num_cores else(8 + 16 /(2 - smt_enabled)) -def HBM_Only(self, EV, level): - return 0 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -229,7 +242,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -237,23 +250,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))))) if HBM_Only(self, EV, level) else(100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.HBM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + 
self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))))) + val = (100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.HBM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = (100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * 
(self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) if HBM_Only(self, EV, level) else 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.HBM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.HBM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.MEM_Latency.compute(EV) / 
(self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related 
bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.HBM_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -271,24 +284,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. 
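# --- Editor's illustrative sketch (hypothetical helper, not part of the patch or the generated files) ---
# Per the formulas in this hunk, Branching_Overhead scales the retired branching
# work to percent of slots, and the renamed Useful_Work is the Retiring share left
# after subtracting that branching work and the assist work. Inputs below are
# hypothetical fractions of SLOTS.
def sketch_retiring_breakdown(retiring, branching_retired, assist_retired):
    branching_overhead = 100 * branching_retired
    useful_work = 100 * (retiring - branching_retired - assist_retired)
    return branching_overhead, useful_work

# Example: Retiring = 0.60 of slots, of which 0.08 is control-flow work
# (branches, calls, NOP padding) and 0.01 is assist overhead:
#   Branching_Overhead ~= 8, Useful_Work ~= 51 (both in Scaled_Slots).
print(sketch_retiring_breakdown(0.60, 0.08, 0.01))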
+def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -299,7 +318,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -315,7 +334,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -337,7 +356,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (EV("FP_ARITH_DISPATCHED.PORT_0", level) + EV("FP_ARITH_DISPATCHED.PORT_1", level) + EV("FP_ARITH_DISPATCHED.PORT_5", level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -345,12 +364,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -375,11 +388,11 @@ def IpBranch(self, EV, level): # Instructions per (near) call (lower number means higher occurrence rate) def IpCall(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("CPU_CLK_UNHALTED.NEAR_CALL", level) + val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_CALL", level) self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -395,51 +408,49 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). 
May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_HP(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + EV("INST_RETIRED.ANY", level) + val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED2.SCALAR", level) if FP16 else 0 self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level)) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("INST_RETIRED.ANY", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). 
Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX256(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level)) - self.thresh = (val < 10) - return val - -# Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions. -def IpArith_AMX_F16(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("AMX_OPS_RETIRED.BF16", level) - self.thresh = (val < 10) - return val - -# Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions. -def IpArith_AMX_Int8(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("AMX_OPS_RETIRED.INT8", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("INST_RETIRED.ANY", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val @@ -473,10 +484,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -501,6 +523,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_DATA.STALLS", level) / EV("ICACHE_DATA.STALLS:c1:e1", level) @@ -535,25 +563,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). 
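# Editorial aside: a hedged, standalone illustration of the IpMisp_* family that
# follows. Each metric divides INST_RETIRED.ANY by one class of retired
# mispredicts, so a lower value means that branch class mispredicts more often
# per instruction. The counter values below are hypothetical, not measured data.
def ip_misp_sketch():
    inst_retired_any = 1_000_000
    retired_mispredicts = {
        "COND_NTAKEN": 900,   # conditional non-taken branches
        "COND_TAKEN": 2_400,  # conditional taken branches
        "RET": 120,           # return branches
        "INDIRECT": 300,      # indirect calls/jumps
    }
    # e.g. IpMisp_Cond_Taken ~= 1_000_000 / 2_400 ~= 417 instructions per mispredict
    return {name: inst_retired_any / count
            for name, count in retired_mispredicts.items()}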
def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -563,7 +591,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -615,6 +643,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -631,19 +663,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -716,7 +744,7 @@ def Offcore_Read_L3M_PKI(self, EV, level): # High-Bandwidth Memory (HBM) accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches) def 
Offcore_Read_HBM_PKI(self, EV, level): EV("OCR.DEMAND_DATA_RD.PMM", level) - return Offcore_Read_L3M_PKI(self, EV, level) if Memory(self, EV, level)> 2 else 1000 * EV("OCR.DEMAND_DATA_RD.PMM", level) / Instructions(self, EV, level) + return Offcore_Read_L3M_PKI(self, EV, level) if Memory > 2 else 1000 * EV("OCR.DEMAND_DATA_RD.PMM", level) / Instructions(self, EV, level) # Off-core accesses per kilo instruction for modified write requests def Offcore_MWrite_Any_PKI(self, EV, level): @@ -732,11 +760,11 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -746,18 +774,10 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) -# Giga Integer (matrix) Operations Per Second -def GIOPs(self, EV, level): - return 8 * EV("AMX_OPS_RETIRED.INT8", level) / 1e9 / Time(self, EV, level) - -# Tera Integer (matrix) Operations Per Second -def TIOPs(self, EV, level): - return GIOPs(self, EV, level) / 1000 - # Average Frequency Utilization relative nominal frequency def Turbo_Utilization(self, EV, level): return CLKS(self, EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) @@ -801,7 +821,7 @@ def R2C_DRAM_BW(self, EV, level): # Average HBM BW for Reads-to-Core. See R2C_Offcore_BW. def R2C_HBM_BW(self, EV, level): EV("OCR.DEMAND_DATA_RD.PMM", level) - return R2C_DRAM_BW(self, EV, level) if Memory(self, EV, level)> 2 else 64 * EV("OCR.DEMAND_DATA_RD.PMM", level) / 1e9 / Time(self, EV, level) + return R2C_DRAM_BW(self, EV, level) if Memory > 2 else 64 * EV("OCR.DEMAND_DATA_RD.PMM", level) / 1e9 / Time(self, EV, level) # Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. memory-controller only def MEM_Read_Latency(self, EV, level): @@ -813,7 +833,9 @@ def MEM_Parallel_Reads(self, EV, level): # Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_PMM_Read_Latency(self, EV, level): - return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) + EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) + EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level) + return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) if PMM_App_Direct else 0 # Average latency of data read request to external DRAM memory [in nanoseconds]. 
Accounts for demand loads and L1/L2 data-read prefetches def MEM_DRAM_Read_Latency(self, EV, level): @@ -821,11 +843,11 @@ def MEM_DRAM_Read_Latency(self, EV, level): # Average 3DXP Memory Bandwidth Use for reads [GB / sec] def PMM_Read_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average 3DXP Memory Bandwidth Use for Writes [GB / sec] def PMM_Write_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU def IO_Read_BW(self, EV, level): @@ -867,11 +889,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_BUBBLES.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -906,7 +928,7 @@ class Fetch_Latency: maxval = None def compute(self, EV): try: - self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) + self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) if topdown_use_fixed else(EV("IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", 2) * Pipeline_Width - EV("INT_MISC.UOP_DROPPING", 2)) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Fetch_Latency zero division") @@ -929,7 +951,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -954,7 +976,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -1011,7 +1033,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -1059,7 +1081,7 @@ class Unknown_Branches: sample = ['FRONTEND_RETIRED.UNKNOWN_BRANCH'] errcount = 0 sibling = None - metricgroup = 
frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1082,7 +1104,7 @@ class MS_Switches: area = "FE" level = 3 htoff = False - sample = ['IDQ.MS_SWITCHES'] + sample = ['FRONTEND_RETIRED.MS_FLOWS'] errcount = 0 sibling = None metricgroup = frozenset(['FetchLat', 'MicroSeq']) @@ -1313,11 +1335,11 @@ class Branch_Mispredicts: sample = ['TOPDOWN.BR_MISPREDICT_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.BR_MISPREDICT_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Branch_Mispredicts zero division") @@ -1344,7 +1366,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1368,7 +1390,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1398,7 +1420,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1422,11 +1444,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1458,7 +1480,7 @@ class Memory_Bound: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.MEMORY_BOUND_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Memory_Bound zero division") @@ -1514,8 +1536,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1546,7 +1568,7 @@ class Load_STLB_Hit: errcount = 
0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1570,7 +1592,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1615,13 +1637,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1674,7 +1721,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1703,7 +1750,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1753,7 +1800,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1781,7 +1828,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1808,7 +1855,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1836,7 +1883,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1864,9 +1911,7 @@ class HBM_Bound: maxval = None def compute(self, EV): try: - self.val = MEM_Bound_Ratio(self, EV, 3) if HBM_Only(self, EV, 3) else MEM_Bound_Ratio(self, EV, 3) * EV("OCR.DEMAND_DATA_RD.PMM", 3) / EV("OCR.READS_TO_CORE.L3_MISS", 3) - 
EV("OCR.READS_TO_CORE.L3_MISS", 3) - EV("OCR.DEMAND_DATA_RD.PMM", 3) + self.val = MEM_Bound_Ratio(self, EV, 3) * EV("OCR.DEMAND_DATA_RD.PMM", 3) / EV("OCR.READS_TO_CORE.L3_MISS", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "HBM_Bound zero division") @@ -1889,7 +1934,7 @@ class DRAM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = self.HBM_Bound.compute(EV) if HBM_Only(self, EV, 3) else MEM_Bound_Ratio(self, EV, 3) - self.HBM_Bound.compute(EV) + self.val = MEM_Bound_Ratio(self, EV, 3) - self.HBM_Bound.compute(EV) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "DRAM_Bound zero division") @@ -1909,7 +1954,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1971,7 +2016,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1997,7 +2042,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -2028,7 +2073,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_MEM zero division") @@ -2052,7 +2098,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -2077,10 +2125,10 @@ class PMM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0 ) - EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) 
else 0) if PMM_App_Direct else 0 EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3) EV("MEM_LOAD_RETIRED.L1_MISS", 3) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "PMM_Bound zero division") @@ -2129,7 +2177,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2157,7 +2205,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2212,7 +2260,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -2236,7 +2285,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2344,8 +2393,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIV_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2370,7 +2419,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2489,7 +2538,7 @@ class AMX_Busy: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Compute', 'HPC', 'Server']) + metricgroup = frozenset(['BvCB', 'Compute', 'HPC', 'Server']) maxval = None def compute(self, EV): try: @@ -2553,7 +2602,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + EV("RS.EMPTY:u1", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) + self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + max(EV("RS.EMPTY:u1", 4) - EV("RESOURCE_STALLS.SCOREBOARD", 4) , 0)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2663,7 +2712,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2826,11 +2875,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / 
EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2900,7 +2949,7 @@ class FP_Arith: maxval = None def compute(self, EV): try: - self.val = self.X87_Use.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV) + self.FP_AMX.compute(EV) + self.val = self.X87_Use.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Arith zero division") @@ -3003,7 +3052,10 @@ class FP_Vector_128b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_128b zero division") @@ -3028,7 +3080,10 @@ class FP_Vector_256b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_256b zero division") @@ -3040,32 +3095,6 @@ def compute(self, EV): length""" -class FP_AMX: - name = "FP_AMX" - domain = "Uops_Estimated" - area = "RET" - level = 4 - htoff = False - sample = [] - errcount = 0 - sibling = None - metricgroup = frozenset(['Compute', 'Flops', 'HPC', 'Pipeline', 'Server']) - maxval = None - def compute(self, EV): - try: - self.val = EV("AMX_OPS_RETIRED.BF16:c1", 4) / Retired_Slots(self, EV, 4) - self.thresh = (self.val > 0.1) and self.parent.thresh - except ZeroDivisionError: - handle_error(self, "FP_AMX zero division") - return self.val - desc = """ -This metric approximates arithmetic floating-point (FP) -matrix uops fraction the CPU has retired (aggregated across -all supported FP datatypes in AMX engine). 
Refer to AMX_Busy -and GFLOPs metrics for actual AMX utilization and FP -performance, resp.""" - - class Int_Operations: name = "Int_Operations" domain = "Uops" @@ -3135,35 +3164,9 @@ def compute(self, EV): handle_error(self, "Int_Vector_256b zero division") return self.val desc = """ -This metric represents 256-bit vector Integer ADD/SUB/SAD or -VNNI (Vector Neural Network Instructions) uops fraction the -CPU has retired.""" - - -class Int_AMX: - name = "Int_AMX" - domain = "Uops_Estimated" - area = "RET" - level = 4 - htoff = False - sample = [] - errcount = 0 - sibling = None - metricgroup = frozenset(['Compute', 'HPC', 'IntVector', 'Pipeline', 'Server']) - maxval = None - def compute(self, EV): - try: - self.val = EV("AMX_OPS_RETIRED.INT8:c1", 4) / Retired_Slots(self, EV, 4) - self.thresh = (self.val > 0.1) and self.parent.thresh - except ZeroDivisionError: - handle_error(self, "Int_AMX zero division") - return self.val - desc = """ -This metric approximates arithmetic Integer (Int) matrix -uops fraction the CPU has retired (aggregated across all -supported Int datatypes in AMX engine). Refer to AMX_Busy -and TIOPs metrics for actual AMX utilization and Int -performance, resp.""" +This metric represents 256-bit vector Integer +ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) +uops fraction the CPU has retired.""" class Memory_Operations: @@ -3199,7 +3202,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3211,8 +3214,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -3225,7 +3228,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3275,7 +3278,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3323,14 +3326,14 @@ class Heavy_Operations: area = "RET" level = 2 htoff = False - sample = [] + sample = ['UOPS_RETIRED.HEAVY'] errcount = 0 sibling = None metricgroup = frozenset(['Retire', 'TmaL2']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("UOPS_RETIRED.HEAVY", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) except ZeroDivisionError: handle_error(self, "Heavy_Operations zero division") @@ -3405,7 +3408,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3518,7 +3521,7 @@ class CISC: area = "RET" level = 4 htoff = False - sample = [] + sample = ['FRONTEND_RETIRED.MS_FLOWS'] errcount = 0 sibling = None metricgroup = frozenset([]) @@ -3546,7 +3549,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3566,7 +3569,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3587,7 +3590,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3598,7 +3601,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3607,7 +3611,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3627,7 +3631,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3647,7 +3651,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def 
compute(self, EV): @@ -3667,7 +3671,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3677,7 +3681,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3688,7 +3692,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3709,7 +3713,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3731,7 +3735,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3741,10 +3745,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3753,7 +3756,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3763,31 +3766,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). 
Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3842,7 +3868,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -4024,7 +4050,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -4046,26 +4073,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -4182,7 +4189,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -4244,8 +4251,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_HP: @@ -4265,8 +4273,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_HP zero division") desc = """ Instructions per FP Arithmetic Scalar Half-Precision -instruction (lower number means higher occurrence rate). 
May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_SP: @@ -4286,8 +4295,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -4307,8 +4317,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -4328,8 +4339,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -4349,50 +4360,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" - - -class Metric_IpArith_AMX_F16: - name = "IpArith_AMX_F16" - domain = "Inst_Metric" - maxval = 0 - errcount = 0 - area = "Info.Inst_Mix" - metricgroup = frozenset(['Flops', 'FpVector', 'InsType', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = IpArith_AMX_F16(self, EV, 0) - self.thresh = (self.val < 10) - except ZeroDivisionError: - handle_error_metric(self, "IpArith_AMX_F16 zero division") - desc = """ -Instructions per FP Arithmetic AMX operation (lower number -means higher occurrence rate). Operations factored per -matrices' sizes of the AMX instructions.""" - - -class Metric_IpArith_AMX_Int8: - name = "IpArith_AMX_Int8" - domain = "Inst_Metric" - maxval = 0 - errcount = 0 - area = "Info.Inst_Mix" - metricgroup = frozenset(['IntVector', 'InsType', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = IpArith_AMX_Int8(self, EV, 0) - self.thresh = (self.val < 10) - except ZeroDivisionError: - handle_error_metric(self, "IpArith_AMX_Int8 zero division") - desc = """ -Instructions per Integer Arithmetic AMX operation (lower -number means higher occurrence rate). Operations factored -per matrices' sizes of the AMX instructions.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4518,7 +4487,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4532,8 +4501,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4639,6 +4664,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4794,7 +4839,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4814,7 +4859,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4834,7 +4879,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4854,7 +4899,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, 
"IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4895,7 +4940,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -5136,6 +5181,26 @@ def compute(self, EV): loads (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -5233,8 +5298,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -5253,8 +5317,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -5273,8 +5336,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -5293,8 +5355,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5699,7 +5760,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5718,7 +5779,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5791,45 +5852,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" - - -class Metric_GIOPs: - name = "GIOPs" - domain = "Metric" - maxval = 0 - errcount = 0 - area = "Info.System" - metricgroup = frozenset(['Cor', 'HPC', 'IntVector', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = GIOPs(self, EV, 0) - self.thresh = True - except ZeroDivisionError: - handle_error_metric(self, "GIOPs zero division") - desc = """ -Giga Integer (matrix) Operations Per Second""" - - -class Metric_TIOPs: - name = "TIOPs" - domain = "Metric" - maxval = 0 - errcount = 0 - area = "Info.System" - metricgroup = frozenset(['HPC', 'IntVector', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = TIOPs(self, EV, 0) - self.thresh = True - except ZeroDivisionError: - handle_error_metric(self, "TIOPs zero division") - desc = """ -Tera Integer (matrix) Operations Per Second""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -6312,6 +6335,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = FB_Full() ; r.run(n) ; o["FB_Full"] = n @@ -6366,11 +6390,9 @@ def __init__(self, r): n = FP_Vector() ; r.run(n) ; o["FP_Vector"] = n n = FP_Vector_128b() ; r.run(n) ; o["FP_Vector_128b"] = n n = FP_Vector_256b() ; r.run(n) ; o["FP_Vector_256b"] = n - n = FP_AMX() ; r.run(n) ; o["FP_AMX"] = n n = Int_Operations() ; r.run(n) ; o["Int_Operations"] = n n = Int_Vector_128b() ; r.run(n) ; o["Int_Vector_128b"] = n n = Int_Vector_256b() ; r.run(n) ; o["Int_Vector_256b"] = n - n = Int_AMX() ; r.run(n) ; o["Int_AMX"] = n n = Memory_Operations() ; r.run(n) ; o["Memory_Operations"] = n n = Fused_Instructions() ; r.run(n) ; o["Fused_Instructions"] = n n = Non_Fused_Branches() ; r.run(n) ; o["Non_Fused_Branches"] = n @@ -6412,6 +6434,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["FB_Full"].parent = o["L1_Bound"] @@ -6465,11 +6488,9 @@ def __init__(self, r): o["FP_Vector"].parent = o["FP_Arith"] o["FP_Vector_128b"].parent = o["FP_Vector"] o["FP_Vector_256b"].parent = o["FP_Vector"] - o["FP_AMX"].parent = o["FP_Arith"] o["Int_Operations"].parent = o["Light_Operations"] o["Int_Vector_128b"].parent = o["Int_Operations"] o["Int_Vector_256b"].parent = o["Int_Operations"] - o["Int_AMX"].parent = o["Int_Operations"] o["Memory_Operations"].parent = o["Light_Operations"] o["Fused_Instructions"].parent = o["Light_Operations"] o["Non_Fused_Branches"].parent = o["Light_Operations"] @@ -6498,7 +6519,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; 
r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -6512,7 +6534,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -6527,8 +6548,6 @@ def __init__(self, r): n = Metric_IpArith_Scalar_DP() ; r.metric(n) ; o["IpArith_Scalar_DP"] = n n = Metric_IpArith_AVX128() ; r.metric(n) ; o["IpArith_AVX128"] = n n = Metric_IpArith_AVX256() ; r.metric(n) ; o["IpArith_AVX256"] = n - n = Metric_IpArith_AMX_F16() ; r.metric(n) ; o["IpArith_AMX_F16"] = n - n = Metric_IpArith_AMX_Int8() ; r.metric(n) ; o["IpArith_AMX_Int8"] = n n = Metric_IpPause() ; r.metric(n) ; o["IpPause"] = n n = Metric_IpSWPF() ; r.metric(n) ; o["IpSWPF"] = n n = Metric_Instructions() ; r.metric(n) ; o["Instructions"] = n @@ -6536,11 +6555,15 @@ def __init__(self, r): n = Metric_Strings_Cycles() ; r.metric(n) ; o["Strings_Cycles"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_Unknown_Branch_Cost() ; r.metric(n) ; o["Unknown_Branch_Cost"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -6566,6 +6589,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -6599,8 +6623,6 @@ def __init__(self, r): n = Metric_Core_Frequency() ; r.metric(n) ; o["Core_Frequency"] = n n = Metric_Uncore_Frequency() ; r.metric(n) ; o["Uncore_Frequency"] = n n = Metric_GFLOPs() ; r.metric(n) ; o["GFLOPs"] = n - n = Metric_GIOPs() ; r.metric(n) ; o["GIOPs"] = n - n = Metric_TIOPs() ; r.metric(n) ; o["TIOPs"] = n n = Metric_Turbo_Utilization() ; r.metric(n) ; o["Turbo_Utilization"] = n n = Metric_SMT_2T_Utilization() ; r.metric(n) ; o["SMT_2T_Utilization"] = n n = Metric_Kernel_Utilization() ; r.metric(n) ; o["Kernel_Utilization"] = n @@ -6668,23 +6690,20 @@ def __init__(self, r): o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] - o["FP_Arith"].FP_Vector = o["FP_Vector"] - o["FP_Arith"].FP_Scalar = o["FP_Scalar"] - o["FP_Arith"].FP_AMX = o["FP_AMX"] o["FP_Arith"].Retiring = o["Retiring"] + o["FP_Arith"].FP_Scalar = o["FP_Scalar"] 
o["FP_Arith"].X87_Use = o["X87_Use"] + o["FP_Arith"].FP_Vector = o["FP_Vector"] o["X87_Use"].Retiring = o["Retiring"] o["FP_Scalar"].Retiring = o["Retiring"] o["FP_Vector"].Retiring = o["Retiring"] o["FP_Vector_128b"].Retiring = o["Retiring"] o["FP_Vector_256b"].Retiring = o["Retiring"] - o["FP_AMX"].Retiring = o["Retiring"] o["Int_Operations"].Retiring = o["Retiring"] o["Int_Operations"].Int_Vector_256b = o["Int_Vector_256b"] o["Int_Operations"].Int_Vector_128b = o["Int_Vector_128b"] o["Int_Vector_128b"].Retiring = o["Retiring"] o["Int_Vector_256b"].Retiring = o["Retiring"] - o["Int_AMX"].Retiring = o["Retiring"] o["Memory_Operations"].Retiring = o["Retiring"] o["Memory_Operations"].Light_Operations = o["Light_Operations"] o["Memory_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -6695,17 +6714,16 @@ def __init__(self, r): o["Non_Fused_Branches"].Light_Operations = o["Light_Operations"] o["Non_Fused_Branches"].Heavy_Operations = o["Heavy_Operations"] o["Other_Light_Ops"].Light_Operations = o["Light_Operations"] - o["Other_Light_Ops"].FP_Scalar = o["FP_Scalar"] o["Other_Light_Ops"].Retiring = o["Retiring"] o["Other_Light_Ops"].Heavy_Operations = o["Heavy_Operations"] o["Other_Light_Ops"].Int_Operations = o["Int_Operations"] o["Other_Light_Ops"].Non_Fused_Branches = o["Non_Fused_Branches"] - o["Other_Light_Ops"].FP_AMX = o["FP_AMX"] + o["Other_Light_Ops"].FP_Arith = o["FP_Arith"] o["Other_Light_Ops"].Fused_Instructions = o["Fused_Instructions"] o["Other_Light_Ops"].Int_Vector_128b = o["Int_Vector_128b"] o["Other_Light_Ops"].FP_Vector = o["FP_Vector"] + o["Other_Light_Ops"].FP_Scalar = o["FP_Scalar"] o["Other_Light_Ops"].X87_Use = o["X87_Use"] - o["Other_Light_Ops"].FP_Arith = o["FP_Arith"] o["Other_Light_Ops"].Int_Vector_256b = o["Int_Vector_256b"] o["Other_Light_Ops"].Memory_Operations = o["Memory_Operations"] o["Nop_Instructions"].Retiring = o["Retiring"] @@ -6764,16 +6782,17 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].PMM_Bound = o["PMM_Bound"] - o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] @@ -6799,13 +6818,13 @@ def __init__(self, r): o["Cache_Memory_Latency"].HBM_Bound = o["HBM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] 
o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].PMM_Bound = o["PMM_Bound"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -6814,6 +6833,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].HBM_Bound = o["HBM_Bound"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Synchronization"].L1_Bound = o["L1_Bound"] o["Memory_Synchronization"].Frontend_Bound = o["Frontend_Bound"] @@ -6894,7 +6914,7 @@ def __init__(self, r): o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] - o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] @@ -6914,6 +6934,7 @@ def __init__(self, r): o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] + o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] @@ -6938,19 +6959,19 @@ def __init__(self, r): o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] o["Other_Bottlenecks"].HBM_Bound = o["HBM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].MITE = o["MITE"] o["DSB_Misses"].LCP = o["LCP"] @@ -6964,6 +6985,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = 
o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -7041,5 +7067,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/spr_server_ratios.py b/spr_server_ratios.py index af9842ce..08631bba 100644 --- a/spr_server_ratios.py +++ b/spr_server_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full description for Intel Xeon Scalable Processors 4th gen (code name Sapphire Rapids) +# auto generated TopDown/TMA 4.8-full-perf description for Intel Xeon Scalable Processors 4th gen (code name Sapphire Rapids) # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,11 +16,14 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 1 Average_Frequency = 0.0 - +num_cores = 1 +num_threads = 1 +num_sockets = 1 +topdown_use_fixed = False def handle_error(obj, msg): print_error(msg) @@ -46,7 +49,13 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 -Errata_Whitelist = "SPR103" +Errata_Whitelist = "SPR121;SPR103" +Memory = 0 +PMM_App_Direct = 1 if Memory == 1 else 0 +HBM = 1 if Memory > 1 else 0 +PERF_METRICS_MSR = 1 +FP16 = 1 +DS = 1 # Aux. 
formulas @@ -55,7 +64,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + EV("RS.EMPTY:u1", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.AMX_Busy.compute(EV) + self.Divider.compute(EV)) @@ -96,15 +105,29 @@ def Few_Uops_Executed_Threshold(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FLOP_Count(self, EV, level): - return (1 *(EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR_HALF", level)) + 2 *(EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF", level)) + 4 * EV("FP_ARITH_INST_RETIRED.4_FLOPS", level) + 8 *(EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.8_FLOPS", level)) + 16 *(EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) + 32 * EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", level) + 4 * EV("AMX_OPS_RETIRED.BF16", level)) + return (1 *(EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR_HALF", level)) + 2 *(EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF", level)) + 4 * EV("FP_ARITH_INST_RETIRED.4_FLOPS", level) + 8 *(EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.8_FLOPS", level)) + 16 *(EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) + 32 * EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", level)) if FP16 else(1 * EV("FP_ARITH_INST_RETIRED.SCALAR", level) + 2 * EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + 4 * EV("FP_ARITH_INST_RETIRED.4_FLOPS", level) + 8 * EV("FP_ARITH_INST_RETIRED.8_FLOPS", level) + 16 * EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) + EV("FP_ARITH_INST_RETIRED2.SCALAR", level) if FP16 else EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return
EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0xfc", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) + EV("FP_ARITH_INST_RETIRED.VECTOR", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) + EV("FP_ARITH_INST_RETIRED2.VECTOR", level) if FP16 else EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -120,19 +143,24 @@ def LOAD_LCL_MEM(self, EV, level): return EV("MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) def LOAD_LCL_PMM(self, EV, level): - return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) + return EV("MEM_LOAD_RETIRED.LOCAL_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_FWD(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_HITM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_MEM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_RMT_PMM(self, EV, level): - return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) + return EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", level) * (1 + FBHit_per_L1Miss(self, EV, level)) if DS else 0 def LOAD_XSNP_HIT(self, EV, level): return EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", level) + EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", level) * (1 - True_XSNP_HitM_Fraction(self, EV, level)) @@ -147,7 +175,7 @@ def MEM_Bound_Ratio(self, EV, level): return EV("MEMORY_ACTIVITY.STALLS_L3_MISS", level) / CLKS(self, EV, level) def Mem_DDR_Hit_Fraction(self, EV, level): - return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) + return (19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) / ((19 * LOAD_RMT_MEM(self, EV, level) + 10 *(LOAD_LCL_MEM(self, EV, level) + LOAD_RMT_FWD(self, EV, level) + LOAD_RMT_HITM(self, EV, level))) + (25 * LOAD_LCL_PMM(self, EV, level) + 33 * LOAD_RMT_PMM(self, EV, level))) if DS else 1 def Mem_Lock_St_Fraction(self, EV, level): return EV("MEM_INST_RETIRED.LOCK_LOADS", level) / EV("MEM_INST_RETIRED.ALL_STORES", level) @@ -171,28 +199,28 @@ def True_XSNP_HitM_Fraction(self, EV, level): return EV("OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", level) / (EV("OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", level) + EV("OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD", level)) def 
Mem_XSNP_HitM_Cost(self, EV, level): - return 80 * Core_Frequency(self, EV, level) + return 81 * Core_Frequency(self, EV, level) def Mem_XSNP_Hit_Cost(self, EV, level): - return 79.5 * Core_Frequency(self, EV, level) + return 79 * Core_Frequency(self, EV, level) def Mem_XSNP_None_Cost(self, EV, level): return 37 * Core_Frequency(self, EV, level) def Mem_Local_DRAM_Cost(self, EV, level): - return 108 * Core_Frequency(self, EV, level) + return 109 * Core_Frequency(self, EV, level) def Mem_Remote_DRAM_Cost(self, EV, level): - return 186 * Core_Frequency(self, EV, level) + return 190 * Core_Frequency(self, EV, level) def Mem_Remote_HitM_Cost(self, EV, level): - return 172.5 * Core_Frequency(self, EV, level) + return 170 * Core_Frequency(self, EV, level) def Mem_Remote_Fwd_Cost(self, EV, level): - return 172.5 * Core_Frequency(self, EV, level) + return 170 * Core_Frequency(self, EV, level) def Mem_L2_Hit_Cost(self, EV, level): - return 4 * Core_Frequency(self, EV, level) + return 4.4 * Core_Frequency(self, EV, level) def PERF_METRICS_SUM(self, EV, level): return (EV("PERF_METRICS.FRONTEND_BOUND", level) / EV("TOPDOWN.SLOTS", level)) + (EV("PERF_METRICS.BAD_SPECULATION", level) / EV("TOPDOWN.SLOTS", level)) + (EV("PERF_METRICS.RETIRING", level) / EV("TOPDOWN.SLOTS", level)) + (EV("PERF_METRICS.BACKEND_BOUND", level) / EV("TOPDOWN.SLOTS", level)) @@ -206,13 +234,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def Num_CPUs(self, EV, level): - return 224 if smt_enabled else 112 - -def Memory(self, EV, level): - return 1 + return num_cores * num_sockets * num_threads if num_cores else 224 /(2 - smt_enabled ) -def PMM_App_Direct(self, EV, level): - return 1 if Memory(self, EV, level)== 1 else 0 +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. 
An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -226,7 +252,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -234,23 +260,23 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + 
self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + 
self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / max(self.Memory_Bound.compute(EV) , (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV)))) * (self.DTLB_Load.compute(EV) / max(self.L1_Bound.compute(EV) , (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val -# Total pipeline cost of Memory Synchornization related bottlenecks (data transfers and coherency updates across processors) +# Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors) def Memory_Synchronization(self, EV, level): val = 100 *(self.Memory_Bound.compute(EV) * ((self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * (self.Contested_Accesses.compute(EV) + self.Data_Sharing.compute(EV)) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)) + (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.PMM_Bound.compute(EV) + self.L2_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV))) * self.False_Sharing.compute(EV) / ((self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)) - self.Store_Latency.compute(EV))) + self.Machine_Clears.compute(EV) * (1 - self.Other_Nukes.compute(EV) / (self.Other_Nukes.compute(EV)))) self.thresh = (val > 10) @@ -268,24 +294,30 @@ def 
Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. 
Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -296,7 +328,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -312,7 +344,7 @@ def CLKS(self, EV, level): # Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward) def SLOTS(self, EV, level): - return EV("TOPDOWN.SLOTS", level) + return EV("TOPDOWN.SLOTS", level) if topdown_use_fixed else EV("TOPDOWN.SLOTS", level) # Fraction of Physical Core issue-slots utilized by this Logical Processor def Slots_Utilization(self, EV, level): @@ -334,7 +366,7 @@ def FLOPc(self, EV, level): def FP_Arith_Utilization(self, EV, level): return (EV("FP_ARITH_DISPATCHED.PORT_0", level) + EV("FP_ARITH_DISPATCHED.PORT_1", level) + EV("FP_ARITH_DISPATCHED.PORT_5", level)) / (2 * CORE_CLKS(self, EV, level)) -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per logical-processor +# Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor) def ILP(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / EV("UOPS_EXECUTED.THREAD:c1", level) @@ -342,12 +374,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -372,11 +398,11 @@ def IpBranch(self, EV, level): # Instructions per (near) call (lower number means higher occurrence rate) def IpCall(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("CPU_CLK_UNHALTED.NEAR_CALL", level) + val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_CALL", level) self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -392,57 +418,59 @@ def IpFLOP(self, EV, level): self.thresh = (val < 10) return val -# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW. +# Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW. 
def IpArith(self, EV, level): val = EV("INST_RETIRED.ANY", level) / (FP_Arith_Scalar(self, EV, level) + FP_Arith_Vector(self, EV, level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_HP(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED2.SCALAR", level) + val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED2.SCALAR", level) if FP16 else 0 self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_SP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_Scalar_DP(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", level) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX128(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level)) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.
def IpArith_AVX256(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level)) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val -# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. +# Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. def IpArith_AVX512(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", level)) - self.thresh = (val < 10) - return val - -# Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions. -def IpArith_AMX_F16(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("AMX_OPS_RETIRED.BF16", level) - self.thresh = (val < 10) - return val - -# Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions.
-def IpArith_AMX_Int8(self, EV, level): - val = EV("INST_RETIRED.ANY", level) / EV("AMX_OPS_RETIRED.INT8", level) + val = EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", level)) if FP16 else EV("INST_RETIRED.ANY", level) / (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", level) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", level)) self.thresh = (val < 10) return val @@ -476,10 +504,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -504,6 +543,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_DATA.STALLS", level) / EV("ICACHE_DATA.STALLS:c1:e1", level) @@ -538,25 +583,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).
def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -566,7 +611,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -618,6 +663,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -634,19 +683,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -730,11 +775,11 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): @@ -744,18 +789,10 @@ def Core_Frequency(self, EV, level): def Uncore_Frequency(self, EV, level): return Socket_CLKS(self, EV, level) / 1e9 / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine. +# Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) -# Giga Integer (matrix) Operations Per Second -def GIOPs(self, EV, level): - return 8 * EV("AMX_OPS_RETIRED.INT8", level) / 1e9 / Time(self, EV, level) - -# Tera Integer (matrix) Operations Per Second -def TIOPs(self, EV, level): - return GIOPs(self, EV, level) / 1000 - # Average Frequency Utilization relative nominal frequency def Turbo_Utilization(self, EV, level): return CLKS(self, EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) @@ -806,7 +843,9 @@ def MEM_Parallel_Reads(self, EV, level): # Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_PMM_Read_Latency(self, EV, level): - return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) + return (OneBillion *(EV("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM", level) / EV("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM", level)) / EV("UNC_CHA_CLOCKTICKS:one_unit", level)) if PMM_App_Direct else 0 # Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches def MEM_DRAM_Read_Latency(self, EV, level): @@ -814,11 +853,11 @@ def MEM_DRAM_Read_Latency(self, EV, level): # Average 3DXP Memory Bandwidth Use for reads [GB / sec] def PMM_Read_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_RPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average 3DXP Memory Bandwidth Use for Writes [GB / sec] def PMM_Write_BW(self, EV, level): - return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) + return ((64 * EV("UNC_M_PMM_WPQ_INSERTS", level) / OneBillion) / Time(self, EV, level)) if PMM_App_Direct else 0 # Average IO (network or disk) Bandwidth Use for Reads [GB / sec].
Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU def IO_Read_BW(self, EV, level): @@ -860,11 +899,11 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) + self.val = (EV("PERF_METRICS.FRONTEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) - EV("INT_MISC.UOP_DROPPING", 1) / SLOTS(self, EV, 1) if topdown_use_fixed else(EV("IDQ_BUBBLES.CORE", 1) - EV("INT_MISC.UOP_DROPPING", 1)) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.15) except ZeroDivisionError: handle_error(self, "Frontend_Bound zero division") @@ -899,7 +938,7 @@ class Fetch_Latency: maxval = None def compute(self, EV): try: - self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) + self.val = ((EV("PERF_METRICS.FETCH_LATENCY", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) - EV("INT_MISC.UOP_DROPPING", 2) / SLOTS(self, EV, 2)) if topdown_use_fixed else(EV("IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", 2) * Pipeline_Width - EV("INT_MISC.UOP_DROPPING", 2)) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Fetch_Latency zero division") @@ -922,7 +961,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -947,7 +986,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -1004,7 +1043,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -1052,7 +1091,7 @@ class Unknown_Branches: sample = ['FRONTEND_RETIRED.UNKNOWN_BRANCH'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -1306,11 +1345,11 @@ class Branch_Mispredicts: sample = ['TOPDOWN.BR_MISPREDICT_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.BRANCH_MISPREDICTS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.BR_MISPREDICT_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: 
handle_error(self, "Branch_Mispredicts zero division") @@ -1337,7 +1376,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1361,7 +1400,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1391,7 +1430,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1415,11 +1454,11 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.BACKEND_BOUND", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("TOPDOWN.BACKEND_BOUND_SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.2) except ZeroDivisionError: handle_error(self, "Backend_Bound zero division") @@ -1451,7 +1490,7 @@ class Memory_Bound: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.MEMORY_BOUND", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("TOPDOWN.MEMORY_BOUND_SLOTS", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Memory_Bound zero division") @@ -1507,8 +1546,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = min(Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT:c1", 4) + EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 4) , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) @@ -1539,7 +1578,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV) @@ -1563,7 +1602,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1608,13 +1647,38 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) 
, 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) @@ -1667,7 +1731,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMS', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1696,7 +1760,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1746,7 +1810,7 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1774,7 +1838,7 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -1801,7 +1865,7 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) + metricgroup = frozenset(['BvML', 'MemoryLat']) maxval = 1.0 def compute(self, EV): try: @@ -1829,7 +1893,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1877,7 +1941,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMS', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1939,7 +2003,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1965,7 +2029,7 @@ class Local_MEM: area = "BE/Mem" level = 5 htoff = False - sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + sample = ['MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM'] errcount = 0 sibling = None metricgroup = frozenset(['Server']) @@ -1996,7 +2060,8 @@ class Remote_MEM: maxval = 1.0 def compute(self, EV): try: - self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = (Mem_Remote_DRAM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, 
"Remote_MEM zero division") @@ -2020,7 +2085,9 @@ class Remote_Cache: maxval = 1.0 def compute(self, EV): try: - self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) + self.val = ((Mem_Remote_HitM_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + (Mem_Remote_Fwd_Cost(self, EV, 5) - Mem_XSNP_None_Cost(self, EV, 5)) * EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5)) * FB_Factor(self, EV, 5) / CLKS(self, EV, 5) if DS else 0 + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM", 5) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD", 5) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Remote_Cache zero division") @@ -2045,10 +2112,10 @@ class PMM_Bound: maxval = 1.0 def compute(self, EV): try: - self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0 ) - EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + self.val = (((1 - Mem_DDR_Hit_Fraction(self, EV, 3)) * MEM_Bound_Ratio(self, EV, 3)) if (OneMillion *(EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) + EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3))> EV("MEM_LOAD_RETIRED.L1_MISS", 3)) else 0) if PMM_App_Direct else 0 EV("MEM_LOAD_RETIRED.LOCAL_PMM", 3) EV("MEM_LOAD_RETIRED.L1_MISS", 3) + EV("MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", 3) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "PMM_Bound zero division") @@ -2097,7 +2164,7 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = 1.0 def compute(self, EV): try: @@ -2125,7 +2192,7 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) maxval = 1.0 def compute(self, EV): try: @@ -2180,7 +2247,8 @@ class Streaming_Stores: maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -2204,7 +2272,7 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) + metricgroup = frozenset(['BvMT', 'MemoryTLB']) maxval = 1.0 def compute(self, EV): try: @@ -2312,8 +2380,8 @@ class Divider: sample = ['ARITH.DIVIDER_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIV_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2338,7 +2406,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2457,7 
+2525,7 @@ class AMX_Busy: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Compute', 'HPC', 'Server']) + metricgroup = frozenset(['BvCB', 'Compute', 'HPC', 'Server']) maxval = None def compute(self, EV): try: @@ -2521,7 +2589,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = (EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + EV("RS.EMPTY:u1", 4)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) + self.val = (EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + max(EV("RS.EMPTY:u1", 4) - EV("RESOURCE_STALLS.SCOREBOARD", 4) , 0)) / CLKS(self, EV, 4) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2631,7 +2699,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2794,11 +2862,11 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) + self.val = (EV("PERF_METRICS.RETIRING", 1) / EV("TOPDOWN.SLOTS", 1)) / PERF_METRICS_SUM(self, EV, 1) if topdown_use_fixed else EV("UOPS_RETIRED.SLOTS", 1) / SLOTS(self, EV, 1) self.thresh = (self.val > 0.7) or self.Heavy_Operations.thresh except ZeroDivisionError: handle_error(self, "Retiring zero division") @@ -2868,7 +2936,7 @@ class FP_Arith: maxval = None def compute(self, EV): try: - self.val = self.X87_Use.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV) + self.FP_AMX.compute(EV) + self.val = self.X87_Use.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Arith zero division") @@ -2971,7 +3039,10 @@ class FP_Vector_128b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_128b zero division") @@ -2996,7 +3067,10 @@ class FP_Vector_256b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + 
EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_256b zero division") @@ -3021,7 +3095,10 @@ class FP_Vector_512b: maxval = 1.0 def compute(self, EV): try: - self.val = (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) + self.val = (EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", 5)) / Retired_Slots(self, EV, 5) if FP16 else(EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", 5) + EV("FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", 5) + EV("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", 5) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "FP_Vector_512b zero division") @@ -3032,32 +3109,6 @@ def compute(self, EV): due to FMA double counting.""" -class FP_AMX: - name = "FP_AMX" - domain = "Uops_Estimated" - area = "RET" - level = 4 - htoff = False - sample = [] - errcount = 0 - sibling = None - metricgroup = frozenset(['Compute', 'Flops', 'HPC', 'Pipeline', 'Server']) - maxval = None - def compute(self, EV): - try: - self.val = EV("AMX_OPS_RETIRED.BF16:c1", 4) / Retired_Slots(self, EV, 4) - self.thresh = (self.val > 0.1) and self.parent.thresh - except ZeroDivisionError: - handle_error(self, "FP_AMX zero division") - return self.val - desc = """ -This metric approximates arithmetic floating-point (FP) -matrix uops fraction the CPU has retired (aggregated across -all supported FP datatypes in AMX engine). 
Refer to AMX_Busy -and GFLOPs metrics for actual AMX utilization and FP -performance, resp.""" - - class Int_Operations: name = "Int_Operations" domain = "Uops" @@ -3071,7 +3122,7 @@ class Int_Operations: maxval = None def compute(self, EV): try: - self.val = self.Int_Vector_128b.compute(EV) + self.Int_Vector_256b.compute(EV) + self.Int_AMX.compute(EV) + self.val = self.Int_Vector_128b.compute(EV) + self.Int_Vector_256b.compute(EV) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Int_Operations zero division") @@ -3127,35 +3178,9 @@ def compute(self, EV): handle_error(self, "Int_Vector_256b zero division") return self.val desc = """ -This metric represents 256-bit vector Integer ADD/SUB/SAD or -VNNI (Vector Neural Network Instructions) uops fraction the -CPU has retired.""" - - -class Int_AMX: - name = "Int_AMX" - domain = "Uops_Estimated" - area = "RET" - level = 4 - htoff = False - sample = [] - errcount = 0 - sibling = None - metricgroup = frozenset(['Compute', 'HPC', 'IntVector', 'Pipeline', 'Server']) - maxval = None - def compute(self, EV): - try: - self.val = EV("AMX_OPS_RETIRED.INT8:c1", 4) / Retired_Slots(self, EV, 4) - self.thresh = (self.val > 0.1) and self.parent.thresh - except ZeroDivisionError: - handle_error(self, "Int_AMX zero division") - return self.val - desc = """ -This metric approximates arithmetic Integer (Int) matrix -uops fraction the CPU has retired (aggregated across all -supported Int datatypes in AMX engine). Refer to AMX_Busy -and TIOPs metrics for actual AMX utilization and Int -performance, resp.""" +This metric represents 256-bit vector Integer +ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) +uops fraction the CPU has retired.""" class Memory_Operations: @@ -3191,7 +3216,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3203,8 +3228,8 @@ def compute(self, EV): desc = """ This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent -multiple contiguous instructions. The instruction pairs of -CMP+JCC or DEC+JCC are commonly used examples.. See section +multiple contiguous instructions. CMP+JCC or DEC+JCC are +common examples of legacy fusions. {}. 
See section 'Optimizing for Macro-fusion' in Optimization Manual:""" @@ -3217,7 +3242,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3267,7 +3292,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3322,7 +3347,7 @@ class Heavy_Operations: maxval = None def compute(self, EV): try: - self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) + self.val = (EV("PERF_METRICS.HEAVY_OPERATIONS", 2) / EV("TOPDOWN.SLOTS", 2)) / PERF_METRICS_SUM(self, EV, 2) if topdown_use_fixed else EV("UOPS_RETIRED.HEAVY", 2) / SLOTS(self, EV, 2) self.thresh = (self.val > 0.1) except ZeroDivisionError: handle_error(self, "Heavy_Operations zero division") @@ -3397,7 +3422,7 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) + metricgroup = frozenset(['BvIO']) maxval = 1.0 def compute(self, EV): try: @@ -3538,7 +3563,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3558,7 +3583,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3579,7 +3604,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3590,7 +3615,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3599,7 +3625,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3619,7 +3645,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3639,7 +3665,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3659,7 +3685,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3669,7 +3695,7 @@ def compute(self, EV): except ZeroDivisionError: 
handle_error_metric(self, "Memory_Synchronization zero division") desc = """ -Total pipeline cost of Memory Synchornization related +Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)""" @@ -3680,7 +3706,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3701,7 +3727,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3723,7 +3749,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3733,10 +3759,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3745,7 +3770,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3755,31 +3780,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. 
Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3834,7 +3882,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -4016,7 +4064,8 @@ def compute(self, EV): handle_error_metric(self, "ILP zero division") desc = """ Instruction-Level-Parallelism (average number of uops -executed when there is execution) per logical-processor""" +executed when there is execution) per thread (logical- +processor)""" class Metric_EPC: @@ -4038,26 +4087,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1.0 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -4174,7 +4203,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -4236,8 +4265,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith zero division") desc = """ Instructions per FP Arithmetic instruction (lower number -means higher occurrence rate). May undercount due to FMA -double counting. Approximated prior to BDW.""" +means higher occurrence rate). Values < 1 are possible due +to intentional FMA double counting. Approximated prior to +BDW.""" class Metric_IpArith_Scalar_HP: @@ -4257,8 +4287,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_HP zero division") desc = """ Instructions per FP Arithmetic Scalar Half-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_SP: @@ -4278,8 +4309,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_SP zero division") desc = """ Instructions per FP Arithmetic Scalar Single-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_Scalar_DP: @@ -4299,8 +4331,9 @@ def compute(self, EV): handle_error_metric(self, "IpArith_Scalar_DP zero division") desc = """ Instructions per FP Arithmetic Scalar Double-Precision -instruction (lower number means higher occurrence rate). May -undercount due to FMA double counting.""" +instruction (lower number means higher occurrence rate). +Values < 1 are possible due to intentional FMA double +counting.""" class Metric_IpArith_AVX128: @@ -4320,8 +4353,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX128 zero division") desc = """ Instructions per FP Arithmetic AVX/SSE 128-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX256: @@ -4341,8 +4374,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX256 zero division") desc = """ Instructions per FP Arithmetic AVX* 256-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" +(lower number means higher occurrence rate). Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpArith_AVX512: @@ -4362,50 +4395,8 @@ def compute(self, EV): handle_error_metric(self, "IpArith_AVX512 zero division") desc = """ Instructions per FP Arithmetic AVX 512-bit instruction -(lower number means higher occurrence rate). May undercount -due to FMA double counting.""" - - -class Metric_IpArith_AMX_F16: - name = "IpArith_AMX_F16" - domain = "Inst_Metric" - maxval = 0 - errcount = 0 - area = "Info.Inst_Mix" - metricgroup = frozenset(['Flops', 'FpVector', 'InsType', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = IpArith_AMX_F16(self, EV, 0) - self.thresh = (self.val < 10) - except ZeroDivisionError: - handle_error_metric(self, "IpArith_AMX_F16 zero division") - desc = """ -Instructions per FP Arithmetic AMX operation (lower number -means higher occurrence rate). Operations factored per -matrices' sizes of the AMX instructions.""" - - -class Metric_IpArith_AMX_Int8: - name = "IpArith_AMX_Int8" - domain = "Inst_Metric" - maxval = 0 - errcount = 0 - area = "Info.Inst_Mix" - metricgroup = frozenset(['IntVector', 'InsType', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = IpArith_AMX_Int8(self, EV, 0) - self.thresh = (self.val < 10) - except ZeroDivisionError: - handle_error_metric(self, "IpArith_AMX_Int8 zero division") - desc = """ -Instructions per Integer Arithmetic AMX operation (lower -number means higher occurrence rate). Operations factored -per matrices' sizes of the AMX instructions.""" +(lower number means higher occurrence rate). 
Values < 1 are +possible due to intentional FMA double counting.""" class Metric_IpPause: @@ -4531,7 +4522,7 @@ def compute(self, EV): class Metric_Execute: name = "Execute" - domain = "Core_Metric" + domain = "Metric" maxval = Exe_Ports errcount = 0 area = "Info.Pipeline" @@ -4545,8 +4536,64 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: @@ -4652,6 +4699,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4807,7 +4874,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4827,7 +4894,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4847,7 +4914,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4867,7 +4934,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, 
"IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4908,7 +4975,7 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: @@ -5149,6 +5216,26 @@ def compute(self, EV): loads (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -5246,8 +5333,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -5266,8 +5352,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -5286,8 +5371,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -5306,8 +5390,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: @@ -5691,7 +5774,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5710,7 +5793,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5783,45 +5866,7 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width and AMX engine.""" - - -class Metric_GIOPs: - name = "GIOPs" - domain = "Metric" - maxval = 0 - errcount = 0 - area = "Info.System" - metricgroup = frozenset(['Cor', 'HPC', 'IntVector', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = GIOPs(self, EV, 0) - self.thresh = True - except ZeroDivisionError: - handle_error_metric(self, "GIOPs zero division") - desc = """ -Giga Integer (matrix) Operations Per Second""" - - -class Metric_TIOPs: - name = "TIOPs" - domain = "Metric" - maxval = 0 - errcount = 0 - area = "Info.System" - metricgroup = frozenset(['HPC', 'IntVector', 'Server']) - sibling = None - - def compute(self, EV): - try: - self.val = TIOPs(self, EV, 0) - self.thresh = True - except ZeroDivisionError: - handle_error_metric(self, "TIOPs zero division") - desc = """ -Tera Integer (matrix) Operations Per Second""" +instructions, vector-width""" class Metric_Turbo_Utilization: @@ -6285,6 +6330,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = FB_Full() ; r.run(n) ; o["FB_Full"] = n @@ -6339,11 +6385,9 @@ def __init__(self, r): n = FP_Vector_128b() ; r.run(n) ; o["FP_Vector_128b"] = n n = FP_Vector_256b() ; r.run(n) ; o["FP_Vector_256b"] = n n = FP_Vector_512b() ; r.run(n) ; o["FP_Vector_512b"] = n - n = FP_AMX() ; r.run(n) ; o["FP_AMX"] = n n = Int_Operations() ; r.run(n) ; o["Int_Operations"] = n n = Int_Vector_128b() ; r.run(n) ; o["Int_Vector_128b"] = n n = Int_Vector_256b() ; r.run(n) ; o["Int_Vector_256b"] = n - n = Int_AMX() ; r.run(n) ; o["Int_AMX"] = n n = Memory_Operations() ; r.run(n) ; o["Memory_Operations"] = n n = Fused_Instructions() ; r.run(n) ; o["Fused_Instructions"] = n n = Non_Fused_Branches() ; r.run(n) ; o["Non_Fused_Branches"] = n @@ -6385,6 +6429,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["FB_Full"].parent = o["L1_Bound"] @@ -6438,11 +6483,9 @@ def __init__(self, r): o["FP_Vector_128b"].parent = o["FP_Vector"] o["FP_Vector_256b"].parent = o["FP_Vector"] o["FP_Vector_512b"].parent = o["FP_Vector"] - o["FP_AMX"].parent = o["FP_Arith"] o["Int_Operations"].parent = o["Light_Operations"] o["Int_Vector_128b"].parent = o["Int_Operations"] o["Int_Vector_256b"].parent = o["Int_Operations"] - o["Int_AMX"].parent = o["Int_Operations"] o["Memory_Operations"].parent = o["Light_Operations"] o["Fused_Instructions"].parent = o["Light_Operations"] o["Non_Fused_Branches"].parent = o["Light_Operations"] @@ -6471,7 +6514,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; 
o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -6485,7 +6529,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -6501,8 +6544,6 @@ def __init__(self, r): n = Metric_IpArith_AVX128() ; r.metric(n) ; o["IpArith_AVX128"] = n n = Metric_IpArith_AVX256() ; r.metric(n) ; o["IpArith_AVX256"] = n n = Metric_IpArith_AVX512() ; r.metric(n) ; o["IpArith_AVX512"] = n - n = Metric_IpArith_AMX_F16() ; r.metric(n) ; o["IpArith_AMX_F16"] = n - n = Metric_IpArith_AMX_Int8() ; r.metric(n) ; o["IpArith_AMX_Int8"] = n n = Metric_IpPause() ; r.metric(n) ; o["IpPause"] = n n = Metric_IpSWPF() ; r.metric(n) ; o["IpSWPF"] = n n = Metric_Instructions() ; r.metric(n) ; o["Instructions"] = n @@ -6510,11 +6551,15 @@ def __init__(self, r): n = Metric_Strings_Cycles() ; r.metric(n) ; o["Strings_Cycles"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_Unknown_Branch_Cost() ; r.metric(n) ; o["Unknown_Branch_Cost"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -6540,6 +6585,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -6572,8 +6618,6 @@ def __init__(self, r): n = Metric_Core_Frequency() ; r.metric(n) ; o["Core_Frequency"] = n n = Metric_Uncore_Frequency() ; r.metric(n) ; o["Uncore_Frequency"] = n n = Metric_GFLOPs() ; r.metric(n) ; o["GFLOPs"] = n - n = Metric_GIOPs() ; r.metric(n) ; o["GIOPs"] = n - n = Metric_TIOPs() ; r.metric(n) ; o["TIOPs"] = n n = Metric_Turbo_Utilization() ; r.metric(n) ; o["Turbo_Utilization"] = n n = Metric_SMT_2T_Utilization() ; r.metric(n) ; o["SMT_2T_Utilization"] = n n = Metric_Kernel_Utilization() ; r.metric(n) ; o["Kernel_Utilization"] = n @@ -6639,25 +6683,21 @@ def __init__(self, r): o["Retiring"].Heavy_Operations = o["Heavy_Operations"] o["Light_Operations"].Retiring = o["Retiring"] o["Light_Operations"].Heavy_Operations = o["Heavy_Operations"] - o["FP_Arith"].FP_Vector = o["FP_Vector"] - o["FP_Arith"].FP_Scalar = o["FP_Scalar"] - o["FP_Arith"].FP_AMX = o["FP_AMX"] o["FP_Arith"].Retiring = o["Retiring"] + 
o["FP_Arith"].FP_Scalar = o["FP_Scalar"] o["FP_Arith"].X87_Use = o["X87_Use"] + o["FP_Arith"].FP_Vector = o["FP_Vector"] o["X87_Use"].Retiring = o["Retiring"] o["FP_Scalar"].Retiring = o["Retiring"] o["FP_Vector"].Retiring = o["Retiring"] o["FP_Vector_128b"].Retiring = o["Retiring"] o["FP_Vector_256b"].Retiring = o["Retiring"] o["FP_Vector_512b"].Retiring = o["Retiring"] - o["FP_AMX"].Retiring = o["Retiring"] o["Int_Operations"].Retiring = o["Retiring"] - o["Int_Operations"].Int_AMX = o["Int_AMX"] o["Int_Operations"].Int_Vector_256b = o["Int_Vector_256b"] o["Int_Operations"].Int_Vector_128b = o["Int_Vector_128b"] o["Int_Vector_128b"].Retiring = o["Retiring"] o["Int_Vector_256b"].Retiring = o["Retiring"] - o["Int_AMX"].Retiring = o["Retiring"] o["Memory_Operations"].Retiring = o["Retiring"] o["Memory_Operations"].Light_Operations = o["Light_Operations"] o["Memory_Operations"].Heavy_Operations = o["Heavy_Operations"] @@ -6668,18 +6708,16 @@ def __init__(self, r): o["Non_Fused_Branches"].Light_Operations = o["Light_Operations"] o["Non_Fused_Branches"].Heavy_Operations = o["Heavy_Operations"] o["Other_Light_Ops"].Light_Operations = o["Light_Operations"] - o["Other_Light_Ops"].FP_Scalar = o["FP_Scalar"] o["Other_Light_Ops"].Retiring = o["Retiring"] o["Other_Light_Ops"].Heavy_Operations = o["Heavy_Operations"] - o["Other_Light_Ops"].Int_AMX = o["Int_AMX"] o["Other_Light_Ops"].Int_Operations = o["Int_Operations"] o["Other_Light_Ops"].Non_Fused_Branches = o["Non_Fused_Branches"] - o["Other_Light_Ops"].FP_AMX = o["FP_AMX"] + o["Other_Light_Ops"].FP_Arith = o["FP_Arith"] o["Other_Light_Ops"].Fused_Instructions = o["Fused_Instructions"] o["Other_Light_Ops"].Int_Vector_128b = o["Int_Vector_128b"] o["Other_Light_Ops"].FP_Vector = o["FP_Vector"] + o["Other_Light_Ops"].FP_Scalar = o["FP_Scalar"] o["Other_Light_Ops"].X87_Use = o["X87_Use"] - o["Other_Light_Ops"].FP_Arith = o["FP_Arith"] o["Other_Light_Ops"].Int_Vector_256b = o["Int_Vector_256b"] o["Other_Light_Ops"].Memory_Operations = o["Memory_Operations"] o["Nop_Instructions"].Retiring = o["Retiring"] @@ -6737,46 +6775,53 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].PMM_Bound = o["PMM_Bound"] - o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - 
o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] o["Cache_Memory_Latency"].PMM_Bound = o["PMM_Bound"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] - o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] - o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] - o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] - o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] - o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] + o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Split_Stores = o["Split_Stores"] o["Memory_Data_TLBs"].PMM_Bound = o["PMM_Bound"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -6785,6 +6830,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Synchronization"].L1_Bound = o["L1_Bound"] o["Memory_Synchronization"].Frontend_Bound = o["Frontend_Bound"] @@ -6864,7 +6910,7 @@ def __init__(self, r): o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] - o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] @@ -6884,6 +6930,7 @@ def __init__(self, r): o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Microcode_Sequencer = 
o["Microcode_Sequencer"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] + o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] @@ -6907,19 +6954,19 @@ def __init__(self, r): o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].MITE = o["MITE"] o["DSB_Misses"].LCP = o["LCP"] @@ -6933,6 +6980,11 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -7011,5 +7063,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],) diff --git a/tl-tester b/tl-tester index ae774d16..64d44f15 100755 --- a/tl-tester +++ b/tl-tester @@ -15,7 +15,7 @@ # NOCPUS=1 skip individual CPU tests # MATCH=cpu in cpu loops only run for CPU cpu # ONLYCPU=cpu Only test cpu type. 
cur for current CPU (makes test suite run much faster) -# NOMULTIPLEX=" " Disable no multiplex +# NOMULTIPLEX=" " Disable no multiplex # NATIVE_ARGS="" Pass argument to toplevs without --force-cpu (to override on unsupported hosts) set -e @@ -121,6 +121,7 @@ SMTCPUS="${SMTCPUS:-snb jkt ivb ivt hsw hsx bdw skl bdx knl skx clx icl tgl icx NOSMTCPUS="${NOSMTCPUS:-slm simple ehl adl-grt}" ALLCPUS="${ALLCPUS:-$SMTCPUS $NOSMTCPUS}" fi +METRICCPUS='icl|tgl|mtl-rwc|spr|sprmax|adl-glc|icx' ALL=--all @@ -334,7 +335,8 @@ notfound $j fi fi -if [ $j == "icl" -o $j == "tgl" ] ; then +case "$j" in +$METRICCPUS) FORCEMETRICS=1 $WRAP ./toplev.py --force-cpu $j --filterquals --force-topology topology --force-events --print $ALL $OPT $LOAD | tee log$$ grep topdown- log$$ @@ -345,7 +347,9 @@ grep topdown- log$$ grep :D log$$ rm log$$ -fi # icl +;; +esac + done diff --git a/toplev.py b/toplev.py index d6145650..3a6e4d3a 100755 --- a/toplev.py +++ b/toplev.py @@ -3925,6 +3925,9 @@ def init_model(model, runner, pe): model.print_error = pe model.check_event = lambda ev: ectx.emap.getevent(ev) is not None model.Setup(runner) + model.num_cores = len(cpu.coreids) # includes sockets + if cpu.ht: + model.num_threads = 2 if "Errata_Whitelist" in model.__dict__: ectx.errata_whitelist += model.Errata_Whitelist.split(";")